Index: test/CodeGen/aarch64-neon-2velem.c
===================================================================
--- test/CodeGen/aarch64-neon-2velem.c
+++ test/CodeGen/aarch64-neon-2velem.c
@@ -1,2452 +1,5011 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

 // Test new aarch64 intrinsics and types

 #include <arm_neon.h>

+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK: ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_s16
   return vmla_lane_s16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK: ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s16
   return vmlaq_lane_s16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK: ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_s32
   return vmla_lane_s32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK: ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s32
   return vmlaq_lane_s32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK: ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s16
   return vmla_laneq_s16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
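All of the vmla/vmls lane tests in this file assert the same IR recipe: broadcast one lane of v with a shufflevector, multiply elementwise, then accumulate. A minimal C sketch of those reference semantics for vmla_lane_s16 follows (an illustrative aside, not part of the patch; vmla_lane_s16_ref and LANE are invented names, and it reuses the arm_neon.h included above):

    /* Reference semantics for vmla_lane_s16(a, b, v, LANE): every element
       of b is multiplied by the single element v[LANE], and the products
       are accumulated into a.  This mirrors the shufflevector + mul + add
       sequence the CHECK lines match. */
    static int16x4_t vmla_lane_s16_ref(int16x4_t a, int16x4_t b, int16x4_t v) {
      enum { LANE = 3 };          /* same lane the test passes to vmla_lane_s16 */
      int16_t out[4];
      for (int i = 0; i < 4; ++i) /* broadcast v[LANE], multiply, accumulate */
        out[i] = (int16_t)(a[i] + b[i] * v[LANE]);
      return vld1_s16(out);
    }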
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK: ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s16
   return vmlaq_laneq_s16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK: ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s32
   return vmla_laneq_s32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK: ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s32
   return vmlaq_laneq_s32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK: ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_s16
   return vmls_lane_s16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK: ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s16
   return vmlsq_lane_s16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK: ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_s32
   return vmls_lane_s32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK: ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s32
   return vmlsq_lane_s32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK: ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s16
   return vmls_laneq_s16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK: ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s16
   return vmlsq_laneq_s16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK: ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s32
   return vmls_laneq_s32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK: ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s32
   return vmlsq_laneq_s32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_s16
   return vmul_lane_s16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s16
   return vmulq_lane_s16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_s32
   return vmul_lane_s32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s32
   return vmulq_lane_s32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_u16
   return vmul_lane_u16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u16
   return vmulq_lane_u16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_u32
   return vmul_lane_u32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u32
   return vmulq_lane_u32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s16
   return vmul_laneq_s16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s16
   return vmulq_laneq_s16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s32
   return vmul_laneq_s32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s32
   return vmulq_laneq_s32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u16
   return vmul_laneq_u16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u16
   return vmulq_laneq_u16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u32
   return vmul_laneq_u32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u32
   return vmulq_laneq_u32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
+// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1)
+// CHECK: ret <2 x float> [[FMLA]]2
 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfma_lane_f32
   return vfma_lane_f32(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1)
+// CHECK: ret <4 x float> [[FMLA]]2
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f32
   return vfmaq_lane_f32(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
+// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK: ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfma_laneq_f32
   return vfma_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK: ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f32
   return vfmaq_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
+// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1)
+// CHECK: ret <2 x float> [[FMLA]]2
 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfms_lane_f32
   return vfms_lane_f32(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1)
+// CHECK: ret <4 x float> [[FMLA]]2
 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f32
   return vfmsq_lane_f32(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
+// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK: ret <2 x float> [[TMP6]]
 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfms_laneq_f32
   return vfms_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK: ret <4 x float> [[TMP6]]
 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f32
   return vfmsq_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA]]1)
+// CHECK: ret <2 x double> [[FMLA]]2
 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f64
   return vfmaq_lane_f64(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }

+// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK: ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f64
   return vfmaq_laneq_f64(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }

+// CHECK-LABEL: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA]]1)
+// CHECK: ret <2 x double> [[FMLA]]2
 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f64
   return vfmsq_lane_f64(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }

+// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK: ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f64
   return vfmsq_laneq_f64(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
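The vfms/vfmls tests above all rely on one identity: there is no separate fused-multiply-subtract intrinsic in the IR, so Clang negates the lane operand with an fsub from -0.0 and feeds the result to the same @llvm.fma call. A scalar C sketch of that identity follows (an illustrative aside, not part of the patch; fms_via_fma is an invented name):

    #include <math.h>

    /* a - b*v computed as fma(b, -v, a): negating one multiplicand turns a
       fused multiply-add into a fused multiply-subtract.  "-0.0 - v" mirrors
       the IR's "fsub <double -0.000000e+00, ...>, %v", which negates
       correctly even for signed zeros. */
    static double fms_via_fma(double a, double b, double v) {
      return fma(b, -0.0 - v, a);
    }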
+// CHECK-LABEL: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
+// CHECK: ret float [[TMP2]]
 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmas_laneq_f32
   return vfmas_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
+// CHECK: ret double [[TMP2]]
 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmsd_lane_f64
   return vfmsd_lane_f64(a, b, v, 0);
-  // CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+\.d\[0\]|fmsub d[0-9]+, d[0-9]+, d[0-9]+}}
 }

+// CHECK-LABEL: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
+// CHECK: ret float [[TMP2]]
 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmss_laneq_f32
   return vfmss_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
+// CHECK: ret double [[TMP2]]
 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsd_laneq_f64
   return vfmsd_laneq_f64(a, b, v, 1);
-  // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK: ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s16
   return vmlal_lane_s16(a, b, v, 3);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
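The vmlal/vmlsl lane tests widen before multiplying: both 16-bit (or 32-bit) inputs are extended, multiplied via the @llvm.aarch64.neon.smull/umull intrinsics, and only then accumulated or subtracted in the wider type. A plain-C sketch of the reference semantics of vmlal_lane_s16 (an illustrative aside, not part of the patch; vmlal_lane_s16_ref and LANE are invented names, and it reuses the arm_neon.h included above):

    /* Reference semantics for vmlal_lane_s16(a, b, v, LANE):
       widen b[i] and v[LANE] to 32 bits, multiply, accumulate into a. */
    static int32x4_t vmlal_lane_s16_ref(int32x4_t a, int16x4_t b, int16x4_t v) {
      enum { LANE = 3 };          /* the lane test_vmlal_lane_s16 uses */
      int32_t out[4];
      for (int i = 0; i < 4; ++i) /* widen to 32 bits, multiply, accumulate */
        out[i] = a[i] + (int32_t)b[i] * (int32_t)v[LANE];
      return vld1q_s32(out);
    }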
CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_lane_s32 return vmlal_lane_s32(a, b, v, 1); - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_laneq_s16 return vmlal_laneq_s16(a, b, v, 7); - // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_laneq_s32 return vmlal_laneq_s32(a, b, v, 3); - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: 
test_vmlal_high_lane_s16 return vmlal_high_lane_s16(a, b, v, 3); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_high_lane_s32 return vmlal_high_lane_s32(a, b, v, 1); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_s16 return vmlal_high_laneq_s16(a, b, v, 7); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_s32 return vmlal_high_laneq_s32(a, b, v, 3); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x 
i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_lane_s16 return vmlsl_lane_s16(a, b, v, 3); - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_lane_s32 return vmlsl_lane_s32(a, b, v, 1); - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_laneq_s16 return vmlsl_laneq_s16(a, b, v, 7); - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_laneq_s32 return vmlsl_laneq_s32(a, b, v, 3); - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: 
[[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_s16 return vmlsl_high_lane_s16(a, b, v, 3); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_s32 return vmlsl_high_lane_s32(a, b, v, 1); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_s16 return vmlsl_high_laneq_s16(a, b, v, 7); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_s32 return vmlsl_high_laneq_s32(a, b, v, 3); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x 
i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlal_lane_u16 return vmlal_lane_u16(a, b, v, 3); - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_lane_u32 return vmlal_lane_u32(a, b, v, 1); - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_laneq_u16 return vmlal_laneq_u16(a, b, v, 7); - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_laneq_u32 return vmlal_laneq_u32(a, b, v, 3); - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: 
[[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlal_high_lane_u16 return vmlal_high_lane_u16(a, b, v, 3); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_high_lane_u32 return vmlal_high_lane_u32(a, b, v, 1); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_u16 return vmlal_high_laneq_u16(a, b, v, 7); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = 
add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_u32 return vmlal_high_laneq_u32(a, b, v, 3); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_lane_u16 return vmlsl_lane_u16(a, b, v, 3); - // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_lane_u32 return vmlsl_lane_u32(a, b, v, 1); - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_laneq_u16 return vmlsl_laneq_u16(a, b, v, 7); - // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 
+// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_laneq_u32 return vmlsl_laneq_u32(a, b, v, 3); - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_u16 return vmlsl_high_lane_u16(a, b, v, 3); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_u32 return vmlsl_high_lane_u32(a, b, v, 1); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_u16 return vmlsl_high_laneq_u16(a, b, v, 7); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] 
to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_u32 return vmlsl_high_laneq_u32(a, b, v, 3); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vmull_lane_s16 return vmull_lane_s16(a, v, 3); - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vmull_lane_s32 return vmull_lane_s32(a, v, 1); - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmull_lane_u16 return vmull_lane_u16(a, v, 3); - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64>
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmull_lane_u32 return vmull_lane_u32(a, v, 1); - // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vmull_high_lane_s16 return vmull_high_lane_s16(a, v, 3); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vmull_high_lane_s32 return vmull_high_lane_s32(a, v, 1); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmull_high_lane_u16 return vmull_high_lane_u16(a, v, 3); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x
i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmull_high_lane_u32 return vmull_high_lane_u32(a, v, 1); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vmull_laneq_s16 return vmull_laneq_s16(a, v, 7); - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vmull_laneq_s32 return vmull_laneq_s32(a, v, 3); - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmull_laneq_u16 return vmull_laneq_u16(a, v, 7); - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { - // CHECK-LABEL: test_vmull_laneq_u32 return vmull_laneq_u32(a, v, 3); - // CHECK: umull
{{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vmull_high_laneq_s16 return vmull_high_laneq_s16(a, v, 7); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vmull_high_laneq_s32 return vmull_high_laneq_s32(a, v, 3); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmull_high_laneq_u16 return vmull_high_laneq_u16(a, v, 7); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v)
{ - // CHECK-LABEL: test_vmull_high_laneq_u32 return vmull_high_laneq_u32(a, v, 3); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlal_lane_s16 return vqdmlal_lane_s16(a, b, v, 3); - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlal_lane_s32 return vqdmlal_lane_s32(a, b, v, 1); - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlal_high_lane_s16 return vqdmlal_high_lane_s16(a, b, v, 3);
- // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlal_high_lane_s32 return vqdmlal_high_lane_s32(a, b, v, 1); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlsl_lane_s16 return vqdmlsl_lane_s16(a, b, v, 3); - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlsl_lane_s32 return vqdmlsl_lane_s32(a, b, v, 1); - // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +//
CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlsl_high_lane_s16 return vqdmlsl_high_lane_s16(a, b, v, 3); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlsl_high_lane_s32 return vqdmlsl_high_lane_s32(a, b, v, 1); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmull_lane_s16 return vqdmull_lane_s16(a, v, 3); - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK:
[[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmull_lane_s32 return vqdmull_lane_s32(a, v, 1); - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmull_laneq_s16 return vqdmull_laneq_s16(a, v, 3); - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmull_laneq_s32 return vqdmull_laneq_s32(a, v, 3); - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]]
= bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmull_high_lane_s16 return vqdmull_high_lane_s16(a, v, 3); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmull_high_lane_s32 return vqdmull_high_lane_s32(a, v, 1); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmull_high_laneq_s16 return vqdmull_high_laneq_s16(a, v, 7); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmull_high_laneq_s32
return vqdmull_high_laneq_s32(a, v, 3); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmulh_lane_s16 return vqdmulh_lane_s16(a, v, 3); - // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmulhq_lane_s16 return vqdmulhq_lane_s16(a, v, 3); - // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmulh_lane_s32 return vqdmulh_lane_s32(a, v, 1); - // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x
i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmulhq_lane_s32 return vqdmulhq_lane_s32(a, v, 1); - // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqrdmulh_lane_s16 return vqrdmulh_lane_s16(a, v, 3); - // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqrdmulhq_lane_s16 return vqrdmulhq_lane_s16(a, v, 3); - // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqrdmulh_lane_s32 return vqrdmulh_lane_s32(a, v, 1); - // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x
i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqrdmulhq_lane_s32 return vqrdmulhq_lane_s32(a, v, 1); - // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) { - // CHECK-LABEL: test_vmul_lane_f32 return vmul_lane_f32(a, v, 1); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 +// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] +// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> +// CHECK: ret <1 x double> [[TMP5]] float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) { - // CHECK-LABEL: test_vmul_lane_f64 return vmul_lane_f64(a, v, 0); - // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+\.d\[0\]|d[0-9]+}} } +// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulq_lane_f32 return vmulq_lane_f32(a, v, 1); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]] +// CHECK: ret <2 x double> [[MUL]] float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) { - // CHECK-LABEL: test_vmulq_lane_f64 return vmulq_lane_f64(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) { - // CHECK-LABEL: test_vmul_laneq_f32
return vmul_laneq_f32(a, v, 3); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] +// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> +// CHECK: ret <1 x double> [[TMP5]] float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) { - // CHECK-LABEL: test_vmul_laneq_f64 return vmul_laneq_f64(a, v, 1); - // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulq_laneq_f32 return vmulq_laneq_f32(a, v, 3); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]] +// CHECK: ret <2 x double> [[MUL]] float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) { - // CHECK-LABEL: test_vmulq_laneq_f64 return vmulq_laneq_f64(a, v, 1); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 +// CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulx_lane_f32 return vmulx_lane_f32(a, v, 1); - // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 +// CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulxq_lane_f32 return vmulxq_lane_f32(a, v, 1); - // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <2 x double>
@test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 +// CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { - // CHECK-LABEL: test_vmulxq_lane_f64 return vmulxq_lane_f64(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 +// CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulx_laneq_f32 return vmulx_laneq_f32(a, v, 3); - // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 +// CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulxq_laneq_f32 return vmulxq_laneq_f32(a, v, 3); - // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 +// CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { - // CHECK-LABEL: test_vmulxq_laneq_f64 return vmulxq_laneq_f64(a, v, 1); - // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +//
CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmla_lane_s16_0 return vmla_lane_s16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlaq_lane_s16_0 return vmlaq_lane_s16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmla_lane_s32_0 return vmla_lane_s32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlaq_lane_s32_0 return vmlaq_lane_s32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmla_laneq_s16_0 return vmla_laneq_s16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlaq_laneq_s16_0 return vmlaq_laneq_s16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: 
test_vmla_laneq_s32_0 return vmla_laneq_s32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlaq_laneq_s32_0 return vmlaq_laneq_s32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmls_lane_s16_0 return vmls_lane_s16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsq_lane_s16_0 return vmlsq_lane_s16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmls_lane_s32_0 return vmls_lane_s32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsq_lane_s32_0 return vmlsq_lane_s32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmls_laneq_s16_0 return vmls_laneq_s16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { 
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsq_laneq_s16_0 return vmlsq_laneq_s16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmls_laneq_s32_0 return vmls_laneq_s32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsq_laneq_s32_0 return vmlsq_laneq_s32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vmul_lane_s16_0 return vmul_lane_s16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vmulq_lane_s16_0 return vmulq_lane_s16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vmul_lane_s32_0 return vmul_lane_s32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vmulq_lane_s32_0 return vmulq_lane_s32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// 
CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmul_lane_u16_0 return vmul_lane_u16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmulq_lane_u16_0 return vmulq_lane_u16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmul_lane_u32_0 return vmul_lane_u32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmulq_lane_u32_0 return vmulq_lane_u32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vmul_laneq_s16_0 return vmul_laneq_s16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vmulq_laneq_s16_0 return vmulq_laneq_s16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vmul_laneq_s32_0 return vmul_laneq_s32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, 
int32x4_t v) { - // CHECK-LABEL: test_vmulq_laneq_s32_0 return vmulq_laneq_s32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmul_laneq_u16_0 return vmul_laneq_u16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmulq_laneq_u16_0 return vmulq_laneq_u16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) { - // CHECK-LABEL: test_vmul_laneq_u32_0 return vmul_laneq_u32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) { - // CHECK-LABEL: test_vmulq_laneq_u32_0 return vmulq_laneq_u32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1) +// CHECK: ret <2 x float> [[FMLA]]2 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vfma_lane_f32_0 return vfma_lane_f32(a, b, v, 0); - // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: 
[[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1) +// CHECK: ret <4 x float> [[FMLA]]2 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vfmaq_lane_f32_0 return vfmaq_lane_f32(a, b, v, 0); - // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]]) +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vfma_laneq_f32_0 return vfma_laneq_f32(a, b, v, 0); - // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]]) +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vfmaq_laneq_f32_0 return vfmaq_laneq_f32(a, b, v, 0); - // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1) +// CHECK: ret <2 x float> [[FMLA]]2 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vfms_lane_f32_0 return vfms_lane_f32(a, b, v, 0); - // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +//
CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1) +// CHECK: ret <4 x float> [[FMLA]]2 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vfmsq_lane_f32_0 return vfmsq_lane_f32(a, b, v, 0); - // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]]) +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vfms_laneq_f32_0 return vfms_laneq_f32(a, b, v, 0); - // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]]) +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vfmsq_laneq_f32_0 return vfmsq_laneq_f32(a, b, v, 0); - // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP4:%.*]] = bitcast
<16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) +// CHECK: ret <2 x double> [[TMP6]] float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { - // CHECK-LABEL: test_vfmaq_laneq_f64_0 return vfmaq_laneq_f64(a, b, v, 0); - // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) +// CHECK: ret <2 x double> [[TMP6]] float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { - // CHECK-LABEL: test_vfmsq_laneq_f64_0 return vfmsq_laneq_f64(a, b, v, 0); - // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlal_lane_s16_0 return vmlal_lane_s16(a, b, v, 0); - // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_lane_s32_0 return vmlal_lane_s32(a, b, v, 0); - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x
i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_laneq_s16_0 return vmlal_laneq_s16(a, b, v, 0); - // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_laneq_s32_0 return vmlal_laneq_s32(a, b, v, 0); - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlal_high_lane_s16_0 return vmlal_high_lane_s16(a, b, v, 0); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t
test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_high_lane_s32_0 return vmlal_high_lane_s32(a, b, v, 0); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_s16_0 return vmlal_high_laneq_s16(a, b, v, 0); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_s32_0 return vmlal_high_laneq_s32(a, b, v, 0); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_lane_s16_0 return vmlsl_lane_s16(a, b, v, 0); - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +//
CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_lane_s32_0 return vmlsl_lane_s32(a, b, v, 0); - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_laneq_s16_0 return vmlsl_laneq_s16(a, b, v, 0); - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_laneq_s32_0 return vmlsl_laneq_s32(a, b, v, 0); - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_s16_0 return vmlsl_high_lane_s16(a, b, v, 0); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x
i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_s32_0 return vmlsl_high_lane_s32(a, b, v, 0); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_s16_0 return vmlsl_high_laneq_s16(a, b, v, 0); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_s32_0 return vmlsl_high_laneq_s32(a, b, v, 0); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v)
{ - // CHECK-LABEL: test_vmlal_lane_u16_0 return vmlal_lane_u16(a, b, v, 0); - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_lane_u32_0 return vmlal_lane_u32(a, b, v, 0); - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_laneq_u16_0 return vmlal_laneq_u16(a, b, v, 0); - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_laneq_u32_0 return vmlal_laneq_u32(a, b, v, 0); - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK:
[[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlal_high_lane_u16_0 return vmlal_high_lane_u16(a, b, v, 0); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_high_lane_u32_0 return vmlal_high_lane_u32(a, b, v, 0); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_u16_0 return vmlal_high_laneq_u16(a, b, v, 0); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_u32_0 return vmlal_high_laneq_u32(a, b, v, 0); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> 
zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_lane_u16_0 return vmlsl_lane_u16(a, b, v, 0); - // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_lane_u32_0 return vmlsl_lane_u32(a, b, v, 0); - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_laneq_u16_0 return vmlsl_laneq_u16(a, b, v, 0); - // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_laneq_u32_0 return vmlsl_laneq_u32(a, b, v, 0); - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = 
shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_u16_0 return vmlsl_high_lane_u16(a, b, v, 0); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_u32_0 return vmlsl_high_lane_u32(a, b, v, 0); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_u16_0 return vmlsl_high_laneq_u16(a, b, v, 0); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> 
[[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_u32_0 return vmlsl_high_laneq_u32(a, b, v, 0); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vmull_lane_s16_0 return vmull_lane_s16(a, v, 0); - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vmull_lane_s32_0 return vmull_lane_s32(a, v, 0); - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmull_lane_u16_0 return vmull_lane_u16(a, v, 0); - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmull_lane_u32_0 return 
vmull_lane_u32(a, v, 0); - // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vmull_high_lane_s16_0 return vmull_high_lane_s16(a, v, 0); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vmull_high_lane_s32_0 return vmull_high_lane_s32(a, v, 0); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmull_high_lane_u16_0 return vmull_high_lane_u16(a, v, 0); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> 
[[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmull_high_lane_u32_0 return vmull_high_lane_u32(a, v, 0); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vmull_laneq_s16_0 return vmull_laneq_s16(a, v, 0); - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vmull_laneq_s32_0 return vmull_laneq_s32(a, v, 0); - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmull_laneq_u16_0 return vmull_laneq_u16(a, v, 0); - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { - // CHECK-LABEL: test_vmull_laneq_u32_0 return vmull_laneq_u32(a, v, 0); - // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, 
{{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vmull_high_laneq_s16_0 return vmull_high_laneq_s16(a, v, 0); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vmull_high_laneq_s32_0 return vmull_high_laneq_s32(a, v, 0); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmull_high_laneq_u16_0 return vmull_high_laneq_u16(a, v, 0); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t 
test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { - // CHECK-LABEL: test_vmull_high_laneq_u32_0 return vmull_high_laneq_u32(a, v, 0); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlal_lane_s16_0 return vqdmlal_lane_s16(a, b, v, 0); - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlal_lane_s32_0 return vqdmlal_lane_s32(a, b, v, 0); - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t 
test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlal_high_lane_s16_0 return vqdmlal_high_lane_s16(a, b, v, 0); - // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlal_high_lane_s32_0 return vqdmlal_high_lane_s32(a, b, v, 0); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlsl_lane_s16_0 return vqdmlsl_lane_s16(a, b, v, 0); - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t 
test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlsl_lane_s32_0 return vqdmlsl_lane_s32(a, b, v, 0); - // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlsl_high_lane_s16_0 return vqdmlsl_high_lane_s16(a, b, v, 0); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlsl_high_lane_s32_0 return vqdmlsl_high_lane_s32(a, b, v, 0); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t 
test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmull_lane_s16_0 return vqdmull_lane_s16(a, v, 0); - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmull_lane_s32_0 return vqdmull_lane_s32(a, v, 0); - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmull_laneq_s16_0 return vqdmull_laneq_s16(a, v, 0); - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmull_laneq_s32_0 return vqdmull_laneq_s32(a, v, 0); - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmull_high_lane_s16_0 return vqdmull_high_lane_s16(a, v, 0); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmull_high_lane_s32_0 return vqdmull_high_lane_s32(a, v, 0); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmull_high_laneq_s16_0 return vqdmull_high_laneq_s16(a, v, 0); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to 
<2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmull_high_laneq_s32_0 return vqdmull_high_laneq_s32(a, v, 0); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmulh_lane_s16_0 return vqdmulh_lane_s16(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmulhq_lane_s16_0 return vqdmulhq_lane_s16(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmulh_lane_s32_0 return vqdmulh_lane_s32(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmulhq_lane_s32_0 return vqdmulhq_lane_s32(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqrdmulh_lane_s16_0 return vqrdmulh_lane_s16(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqrdmulhq_lane_s16_0 return vqrdmulhq_lane_s16(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x 
i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqrdmulh_lane_s32_0 return vqrdmulh_lane_s32(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqrdmulhq_lane_s32_0 return vqrdmulhq_lane_s32(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) { - // CHECK-LABEL: test_vmul_lane_f32_0 return vmul_lane_f32(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulq_lane_f32_0 return vmulq_lane_f32(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) { - // CHECK-LABEL: test_vmul_laneq_f32_0 return vmul_laneq_f32(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] 
+// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> +// CHECK: ret <1 x double> [[TMP5]] float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) { - // CHECK-LABEL: test_vmul_laneq_f64_0 return vmul_laneq_f64(a, v, 0); - // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulq_laneq_f32_0 return vmulq_laneq_f32(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]] +// CHECK: ret <2 x double> [[MUL]] float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { - // CHECK-LABEL: test_vmulq_laneq_f64_0 return vmulq_laneq_f64(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 +// CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulx_lane_f32_0 return vmulx_lane_f32(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 +// CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulxq_lane_f32_0 return vmulxq_lane_f32(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> 
@llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 +// CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { - // CHECK-LABEL: test_vmulxq_lane_f64_0 return vmulxq_lane_f64(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 +// CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulx_laneq_f32_0 return vmulx_laneq_f32(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 +// CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulxq_laneq_f32_0 return vmulxq_laneq_f32(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 +// CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { - // CHECK-LABEL: test_vmulxq_laneq_f64_0 return vmulxq_laneq_f64(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> 
[[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL5_I_I]] int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) { - // CHECK-LABEL: test_vmull_high_n_s16 return vmull_high_n_s16(a, b); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL3_I_I]] int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) { - // CHECK-LABEL: test_vmull_high_n_s32 return vmull_high_n_s32(a, b); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL5_I_I]] uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) { - // CHECK-LABEL: test_vmull_high_n_u16 return vmull_high_n_u16(a, b); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL3_I_I]] 
uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) { - // CHECK-LABEL: test_vmull_high_n_u32 return vmull_high_n_u32(a, b); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V4_I_I]]) #2 +// CHECK: [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) { - // CHECK-LABEL: test_vqdmull_high_n_s16 return vqdmull_high_n_s16(a, b); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V2_I_I]]) #2 +// CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) { - // CHECK-LABEL: test_vqdmull_high_n_s32 return vqdmull_high_n_s32(a, b); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: 
[[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vmlal_high_n_s16 return vmlal_high_n_s16(a, b, c); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vmlal_high_n_s32 return vmlal_high_n_s32(a, b, c); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { - // CHECK-LABEL: test_vmlal_high_n_u16 return vmlal_high_n_u16(a, b, c); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> 
[[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { - // CHECK-LABEL: test_vmlal_high_n_u32 return vmlal_high_n_u32(a, b, c); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2 +// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V6_I_I]] int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vqdmlal_high_n_s16 return vqdmlal_high_n_s16(a, b, c); - // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2 +// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V4_I_I]] int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vqdmlal_high_n_s32 return vqdmlal_high_n_s32(a, b, c); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: 
[[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[SUB_I_I]] int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vmlsl_high_n_s16 return vmlsl_high_n_s16(a, b, c); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[SUB_I_I]] int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vmlsl_high_n_s32 return vmlsl_high_n_s32(a, b, c); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[SUB_I_I]] uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { - // CHECK-LABEL: test_vmlsl_high_n_u16 return vmlsl_high_n_u16(a, b, c); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> 
@test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[SUB_I_I]] uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { - // CHECK-LABEL: test_vmlsl_high_n_u32 return vmlsl_high_n_u32(a, b, c); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2 +// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V6_I_I]] int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vqdmlsl_high_n_s16 return vqdmlsl_high_n_s16(a, b, c); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2 +// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V4_I_I:%.*]] = 
call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V4_I_I]] int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vqdmlsl_high_n_s32 return vqdmlsl_high_n_s32(a, b, c); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]] +// CHECK: ret <2 x float> [[MUL_I]] float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) { - // CHECK-LABEL: test_vmul_n_f32 return vmul_n_f32(a, b); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3 +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]] +// CHECK: ret <4 x float> [[MUL_I]] float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) { - // CHECK-LABEL: test_vmulq_n_f32 return vmulq_n_f32(a, b); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]] +// CHECK: ret <2 x double> [[MUL_I]] float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) { - // CHECK-LABEL: test_vmulq_n_f64 return vmulq_n_f64(a, b); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2 +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) { - // CHECK-LABEL: test_vfma_n_f32 return vfma_n_f32(a, b, n); - // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x 
float> [[VECINIT1_I]], float %n, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2 +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { - // CHECK-LABEL: test_vfmaq_n_f32 return vfmaq_n_f32(a, b, n); - // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP4:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[FMLS1_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[FMLS2_I_I:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLS_I_I]], <2 x float> [[TMP4]], <2 x float> [[FMLS1_I_I]]) #2 +// CHECK: ret <2 x float> [[FMLS2_I_I]] float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) { - // CHECK-LABEL: test_vfms_n_f32 return vfms_n_f32(a, b, n); - // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP4:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[FMLS1_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[FMLS2_I_I:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLS_I_I]], <4 x float> [[TMP4]], <4 x float> [[FMLS1_I_I]]) #2 +// CHECK: ret <4 x float> [[FMLS2_I_I]] float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { - // CHECK-LABEL: test_vfmsq_n_f32 return vfmsq_n_f32(a, b, n); - // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, 
i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i16> [[MUL_I]] int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) { - // CHECK-LABEL: test_vmul_n_s16 return vmul_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] +// CHECK: ret <8 x i16> [[MUL_I]] int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) { - // CHECK-LABEL: test_vmulq_n_s16 return vmulq_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] +// CHECK: ret <2 x i32> [[MUL_I]] int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) { - // CHECK-LABEL: test_vmul_n_s32 return vmul_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i32> [[MUL_I]] int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) { - // CHECK-LABEL: test_vmulq_n_s32 return vmulq_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i16> [[MUL_I]] uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) { - // CHECK-LABEL: test_vmul_n_u16 return vmul_n_u16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: 
define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] +// CHECK: ret <8 x i16> [[MUL_I]] uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) { - // CHECK-LABEL: test_vmulq_n_u16 return vmulq_n_u16(a, b); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] +// CHECK: ret <2 x i32> [[MUL_I]] uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) { - // CHECK-LABEL: test_vmul_n_u32 return vmul_n_u32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i32> [[MUL_I]] uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) { - // CHECK-LABEL: test_vmulq_n_u32 return vmulq_n_u32(a, b); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL5_I]] int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { - // CHECK-LABEL: test_vmull_n_s16 return vmull_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = 
insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL3_I]] int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { - // CHECK-LABEL: test_vmull_n_s32 return vmull_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL5_I]] uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { - // CHECK-LABEL: test_vmull_n_u16 return vmull_n_u16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL3_I]] uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { - // CHECK-LABEL: test_vmull_n_u32 return vmull_n_u32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #2 +// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> 
[[VQDMULL_V5_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) { - // CHECK-LABEL: test_vqdmull_n_s16 return vqdmull_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #2 +// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) { - // CHECK-LABEL: test_vqdmull_n_s32 return vqdmull_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #2 +// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) { - // CHECK-LABEL: test_vqdmulh_n_s16 return vqdmulh_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to 
<16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #2 +// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) { - // CHECK-LABEL: test_vqdmulhq_n_s16 return vqdmulhq_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #2 +// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) { - // CHECK-LABEL: test_vqdmulh_n_s32 return vqdmulh_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #2 +// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) { - // CHECK-LABEL: test_vqdmulhq_n_s32 return vqdmulhq_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = 
bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #2 +// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { - // CHECK-LABEL: test_vqrdmulh_n_s16 return vqrdmulh_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #2 +// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { - // CHECK-LABEL: test_vqrdmulhq_n_s16 return vqrdmulhq_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #2 +// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { - // CHECK-LABEL: test_vqrdmulh_n_s32 return vqrdmulh_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #2 +// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { - // CHECK-LABEL: test_vqrdmulhq_n_s32 return vqrdmulhq_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vmla_n_s16 return vmla_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vmlaq_n_s16 return vmlaq_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t 
test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vmla_n_s32 return vmla_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vmlaq_n_s32 return vmlaq_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { - // CHECK-LABEL: test_vmla_n_u16 return vmla_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { - // CHECK-LABEL: test_vmlaq_n_u16 return vmlaq_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { - // CHECK-LABEL: test_vmla_n_u32 return vmla_n_u32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - 
// CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { - // CHECK-LABEL: test_vmlaq_n_u32 return vmlaq_n_u32(a, b, c); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vmlal_n_s16 return vmlal_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vmlal_n_s32 return vmlal_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> 
[[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { - // CHECK-LABEL: test_vmlal_n_u16 return vmlal_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { - // CHECK-LABEL: test_vmlal_n_u32 return vmlal_n_u32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]] int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vqdmlal_n_s16 return vqdmlal_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = 
bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]] int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vqdmlal_n_s32 return vqdmlal_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vmls_n_s16 return vmls_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vmlsq_n_s16 return vmlsq_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vmls_n_s32 return vmls_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x 
i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vmlsq_n_s32 return vmlsq_n_s32(a, b, c); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { - // CHECK-LABEL: test_vmls_n_u16 return vmls_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { - // CHECK-LABEL: test_vmlsq_n_u16 return vmlsq_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { - // CHECK-LABEL: test_vmls_n_u32 return vmls_n_u32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// 
CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { - // CHECK-LABEL: test_vmlsq_n_u32 return vmlsq_n_u32(a, b, c); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vmlsl_n_s16 return vmlsl_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vmlsl_n_s32 return vmlsl_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { - // CHECK-LABEL: test_vmlsl_n_u16 return vmlsl_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: umlsl {{v[0-9]+}}.4s, 
{{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { - // CHECK-LABEL: test_vmlsl_n_u32 return vmlsl_n_u32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]] int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vqdmlsl_n_s16 return vqdmlsl_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]] int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vqdmlsl_n_s32 return vqdmlsl_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // 
CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmla_lane_u16_0 return vmla_lane_u16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmlaq_lane_u16_0 return vmlaq_lane_u16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmla_lane_u32_0 return vmla_lane_u32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmlaq_lane_u32_0 return vmlaq_lane_u32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmla_laneq_u16_0 return vmla_laneq_u16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmlaq_laneq_u16_0 return vmlaq_laneq_u16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = 
shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmla_laneq_u32_0 return vmla_laneq_u32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmlaq_laneq_u32_0 return vmlaq_laneq_u32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlal_laneq_s16_0 return vqdmlal_laneq_s16(a, b, v, 0); - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlal_laneq_s32_0 return vqdmlal_laneq_s32(a, b, v, 0); - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> 
zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlal_high_laneq_s16_0 return vqdmlal_high_laneq_s16(a, b, v, 0); - // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlal_high_laneq_s32_0 return vqdmlal_high_laneq_s32(a, b, v, 0); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmls_lane_u16_0 return vmls_lane_u16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmlsq_lane_u16_0 return vmlsq_lane_u16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> 
zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmls_lane_u32_0 return vmls_lane_u32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmlsq_lane_u32_0 return vmlsq_lane_u32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmls_laneq_u16_0 return vmls_laneq_u16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmlsq_laneq_u16_0 return vmlsq_laneq_u16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmls_laneq_u32_0 return vmls_laneq_u32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmlsq_laneq_u32_0 return vmlsq_laneq_u32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] 
to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlsl_laneq_s16_0 return vqdmlsl_laneq_s16(a, b, v, 0); - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlsl_laneq_s32_0 return vqdmlsl_laneq_s32(a, b, v, 0); - // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlsl_high_laneq_s16_0 return vqdmlsl_high_laneq_s16(a, b, v, 0); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] =
bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlsl_high_laneq_s32_0 return vqdmlsl_high_laneq_s32(a, b, v, 0); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmulh_laneq_s16_0 return vqdmulh_laneq_s16(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmulhq_laneq_s16_0 return vqdmulhq_laneq_s16(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x 
i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmulh_laneq_s32_0 return vqdmulh_laneq_s32(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmulhq_laneq_s32_0 return vqdmulhq_laneq_s32(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqrdmulh_laneq_s16_0 return vqrdmulh_laneq_s16(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqrdmulhq_laneq_s16_0 return vqrdmulhq_laneq_s16(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = 
shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqrdmulh_laneq_s32_0 return vqrdmulh_laneq_s32(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqrdmulhq_laneq_s32_0 return vqrdmulhq_laneq_s32(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmla_lane_u16 return vmla_lane_u16(a, b, v, 3); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmlaq_lane_u16 return vmlaq_lane_u16(a, b, v, 3); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmla_lane_u32 return vmla_lane_u32(a, b, v, 1); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define
<4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmlaq_lane_u32 return vmlaq_lane_u32(a, b, v, 1); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmla_laneq_u16 return vmla_laneq_u16(a, b, v, 7); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmlaq_laneq_u16 return vmlaq_laneq_u16(a, b, v, 7); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmla_laneq_u32 return vmla_laneq_u32(a, b, v, 3); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmlaq_laneq_u32 return vmlaq_laneq_u32(a, b, v, 3); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32>
[[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlal_laneq_s16 return vqdmlal_laneq_s16(a, b, v, 7); - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlal_laneq_s32 return vqdmlal_laneq_s32(a, b, v, 3); - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlal_high_laneq_s16 return vqdmlal_high_laneq_s16(a, b, v, 7); - // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]])
#2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlal_high_laneq_s32 return vqdmlal_high_laneq_s32(a, b, v, 3); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmls_lane_u16 return vmls_lane_u16(a, b, v, 3); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmlsq_lane_u16 return vmlsq_lane_u16(a, b, v, 3); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmls_lane_u32 return vmls_lane_u32(a, b, v, 1); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmlsq_lane_u32 return vmlsq_lane_u32(a, b, v, 1); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmls_laneq_u16 return vmls_laneq_u16(a, b, v, 7); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmlsq_laneq_u16 return vmlsq_laneq_u16(a, b, v, 7); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i32>
@test_vmls_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmls_laneq_u32 return vmls_laneq_u32(a, b, v, 3); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmlsq_laneq_u32 return vmlsq_laneq_u32(a, b, v, 3); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlsl_laneq_s16 return vqdmlsl_laneq_s16(a, b, v, 7); - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlsl_laneq_s32 return vqdmlsl_laneq_s32(a, b, v, 3); - // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v,
<4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlsl_high_laneq_s16 return vqdmlsl_high_laneq_s16(a, b, v, 7); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlsl_high_laneq_s32 return vqdmlsl_high_laneq_s32(a, b, v, 3); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmulh_laneq_s16 return vqdmulh_laneq_s16(a, v, 7); - // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]]
to <8 x i16> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmulhq_laneq_s16 return vqdmulhq_laneq_s16(a, v, 7); - // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmulh_laneq_s32 return vqdmulh_laneq_s32(a, v, 3); - // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmulhq_laneq_s32 return vqdmulhq_laneq_s32(a, v, 3); - // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqrdmulh_laneq_s16 return vqrdmulh_laneq_s16(a, v, 7); - // CHECK:
sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqrdmulhq_laneq_s16 return vqrdmulhq_laneq_s16(a, v, 7); - // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqrdmulh_laneq_s32 return vqrdmulh_laneq_s32(a, v, 3); - // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqrdmulhq_laneq_s32 return vqrdmulhq_laneq_s32(a, v, 3); - // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } Index: test/CodeGen/aarch64-neon-3v.c =================================================================== --- test/CodeGen/aarch64-neon-3v.c +++ test/CodeGen/aarch64-neon-3v.c @@ -1,486 +1,597 @@ -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new
aarch64 intrinsics and types #include <arm_neon.h> +// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[AND_I]] int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vand_s8 return vand_s8(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[AND_I]] int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vandq_s8 return vandq_s8(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[AND_I]] int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vand_s16 return vand_s16(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[AND_I]] int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vandq_s16 return vandq_s16(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[AND_I]] int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vand_s32 return vand_s32(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[AND_I]] int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vandq_s32 return vandq_s32(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[AND_I]] int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vand_s64 return vand_s64(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[AND_I]] int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vandq_s64 return vandq_s64(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[AND_I]] uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vand_u8 return vand_u8(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[AND_I]] uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vandq_u8 return vandq_u8(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b +//
CHECK: ret <4 x i16> [[AND_I]] uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vand_u16 return vand_u16(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[AND_I]] uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vandq_u16 return vandq_u16(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[AND_I]] uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vand_u32 return vand_u32(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[AND_I]] uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vandq_u32 return vandq_u32(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[AND_I]] uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vand_u64 return vand_u64(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[AND_I]] uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vandq_u64 return vandq_u64(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[OR_I]] int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vorr_s8 return vorr_s8(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[OR_I]] int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vorrq_s8 return vorrq_s8(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[OR_I]] int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vorr_s16 return vorr_s16(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[OR_I]] int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vorrq_s16 return vorrq_s16(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[OR_I]] int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vorr_s32 return vorr_s32(a, b); - // CHECK: orr 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[OR_I]] int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vorrq_s32 return vorrq_s32(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[OR_I]] int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vorr_s64 return vorr_s64(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[OR_I]] int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vorrq_s64 return vorrq_s64(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[OR_I]] uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vorr_u8 return vorr_u8(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[OR_I]] uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vorrq_u8 return vorrq_u8(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[OR_I]] uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vorr_u16 return vorr_u16(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[OR_I]] uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vorrq_u16 return vorrq_u16(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[OR_I]] uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vorr_u32 return vorr_u32(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[OR_I]] uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vorrq_u32 return vorrq_u32(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[OR_I]] uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vorr_u64 return vorr_u64(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, 
%b +// CHECK: ret <2 x i64> [[OR_I]] uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vorrq_u64 return vorrq_u64(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[XOR_I]] int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_veor_s8 return veor_s8(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[XOR_I]] int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_veorq_s8 return veorq_s8(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[XOR_I]] int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_veor_s16 return veor_s16(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[XOR_I]] int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_veorq_s16 return veorq_s16(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[XOR_I]] int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_veor_s32 return veor_s32(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[XOR_I]] int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_veorq_s32 return veorq_s32(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[XOR_I]] int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_veor_s64 return veor_s64(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[XOR_I]] int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_veorq_s64 return veorq_s64(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[XOR_I]] uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_veor_u8 return veor_u8(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[XOR_I]] uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_veorq_u8 return veorq_u8(a, b); - // CHECK: eor 
{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[XOR_I]] uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_veor_u16 return veor_u16(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[XOR_I]] uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_veorq_u16 return veorq_u16(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[XOR_I]] uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_veor_u32 return veor_u32(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[XOR_I]] uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_veorq_u32 return veorq_u32(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[XOR_I]] uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_veor_u64 return veor_u64(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[XOR_I]] uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_veorq_u64 return veorq_u64(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[AND_I]] int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vbic_s8 return vbic_s8(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[AND_I]] int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vbicq_s8 return vbicq_s8(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[AND_I]] int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vbic_s16 return vbic_s16(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[AND_I]] int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) { - // 
CHECK-LABEL: test_vbicq_s16 return vbicq_s16(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[AND_I]] int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vbic_s32 return vbic_s32(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[AND_I]] int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vbicq_s32 return vbicq_s32(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[AND_I]] int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vbic_s64 return vbic_s64(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[AND_I]] int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vbicq_s64 return vbicq_s64(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[AND_I]] uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vbic_u8 return vbic_u8(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[AND_I]] uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vbicq_u8 return vbicq_u8(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[AND_I]] uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vbic_u16 return vbic_u16(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[AND_I]] uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vbicq_u16 return vbicq_u16(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[AND_I]] uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: 
test_vbic_u32 return vbic_u32(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[AND_I]] uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vbicq_u32 return vbicq_u32(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[AND_I]] uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vbic_u64 return vbic_u64(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[AND_I]] uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vbicq_u64 return vbicq_u64(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[OR_I]] int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vorn_s8 return vorn_s8(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[OR_I]] int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vornq_s8 return vornq_s8(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[OR_I]] int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vorn_s16 return vorn_s16(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[OR_I]] int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vornq_s16 return vornq_s16(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[OR_I]] int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vorn_s32 return vorn_s32(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[OR_I]] int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vornq_s32 return vornq_s32(a, b); - // 
CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[OR_I]] int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vorn_s64 return vorn_s64(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[OR_I]] int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vornq_s64 return vornq_s64(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[OR_I]] uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vorn_u8 return vorn_u8(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[OR_I]] uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vornq_u8 return vornq_u8(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[OR_I]] uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vorn_u16 return vorn_u16(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[OR_I]] uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vornq_u16 return vornq_u16(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[OR_I]] uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vorn_u32 return vorn_u32(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[OR_I]] uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vornq_u32 return vornq_u32(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[OR_I]] uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vorn_u64 return vorn_u64(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, 
{{v[0-9]+}}.8b
 }

+// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK: ret <2 x i64> [[OR_I]]
 uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vornq_u64
   return vornq_u64(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
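// The vbic/vorn checks above rely on there being no dedicated IR operation
// for "AND/OR with complement": Clang emits a NOT (xor against an all-ones
// splat) followed by the plain bitwise op, and the AArch64 backend is
// expected to fold the pair back into a single BIC/ORN instruction. A
// minimal plain-C sketch of the same pattern (bic_by_hand is a hypothetical
// helper, not part of the test suite):

#include <arm_neon.h>

// a & ~b, spelled the way the IR checks above match it
int8x8_t bic_by_hand(int8x8_t a, int8x8_t b) {
  int8x8_t not_b = veor_s8(b, vdup_n_s8(-1)); // xor <8 x i8> %b, <i8 -1, ...>
  return vand_s8(a, not_b);                   // and <8 x i8> %a, [[NEG_I]]
}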
Index: test/CodeGen/aarch64-neon-across.c
===================================================================
--- test/CodeGen/aarch64-neon-across.c
+++ test/CodeGen/aarch64-neon-across.c
@@ -1,271 +1,398 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

 // Test new aarch64 intrinsics and types

 #include <arm_neon.h>

+// CHECK-LABEL: define i16 @test_vaddlv_s8(<8 x i8> %a) #0 {
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK: ret i16 [[TMP0]]
 int16_t test_vaddlv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vaddlv_s8
   return vaddlv_s8(a);
-  // CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
 }

+// CHECK-LABEL: define i32 @test_vaddlv_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: ret i32 [[VADDLV_I]]
 int32_t test_vaddlv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vaddlv_s16
   return vaddlv_s16(a);
-  // CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
 }

+// CHECK-LABEL: define i16 @test_vaddlv_u8(<8 x i8> %a) #0 {
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK: ret i16 [[TMP0]]
 uint16_t test_vaddlv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vaddlv_u8
   return vaddlv_u8(a);
-  // CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
 }

+// CHECK-LABEL: define i32 @test_vaddlv_u16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: ret i32 [[VADDLV_I]]
 uint32_t test_vaddlv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vaddlv_u16
   return vaddlv_u16(a);
-  // CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
 }

+// CHECK-LABEL: define i16 @test_vaddlvq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK: ret i16 [[TMP0]]
 int16_t test_vaddlvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vaddlvq_s8
   return vaddlvq_s8(a);
-  // CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
 }

+// CHECK-LABEL: define i32 @test_vaddlvq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: ret i32 [[VADDLV_I]]
 int32_t test_vaddlvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vaddlvq_s16
   return vaddlvq_s16(a);
-  // CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
 }

+// CHECK-LABEL: define i64 @test_vaddlvq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: ret i64 [[VADDLVQ_S32_I]]
 int64_t test_vaddlvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vaddlvq_s32
   return vaddlvq_s32(a);
-  // CHECK: saddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define i16 @test_vaddlvq_u8(<16 x i8> %a) #0 {
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK: ret i16 [[TMP0]]
 uint16_t test_vaddlvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vaddlvq_u8
   return vaddlvq_u8(a);
-  // CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
 }

+// CHECK-LABEL: define i32 @test_vaddlvq_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: ret i32 [[VADDLV_I]]
 uint32_t test_vaddlvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vaddlvq_u16
   return vaddlvq_u16(a);
-  // CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
 }

+// CHECK-LABEL: define i64 @test_vaddlvq_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: ret i64 [[VADDLVQ_U32_I]]
 uint64_t test_vaddlvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vaddlvq_u32
   return vaddlvq_u32(a);
-  // CHECK: uaddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define i8 @test_vmaxv_s8(<8 x i8> %a) #0 {
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK: ret i8 [[TMP0]]
 int8_t test_vmaxv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vmaxv_s8
   return vmaxv_s8(a);
-  // CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }

+// CHECK-LABEL: define i16 @test_vmaxv_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK: ret i16 [[TMP2]]
 int16_t test_vmaxv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vmaxv_s16
   return vmaxv_s16(a);
-  // CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }

+// CHECK-LABEL: define i8 @test_vmaxv_u8(<8 x i8> %a) #0 {
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK: ret i8 [[TMP0]]
 uint8_t test_vmaxv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vmaxv_u8
   return vmaxv_u8(a);
-  // CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }

+// CHECK-LABEL: define i16 @test_vmaxv_u16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK: ret i16 [[TMP2]]
 uint16_t test_vmaxv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vmaxv_u16
   return vmaxv_u16(a);
-  // CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }

+// CHECK-LABEL: define i8 @test_vmaxvq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VMAXV_I:%.*]] = call i32
@llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 +// CHECK: ret i8 [[TMP0]] int8_t test_vmaxvq_s8(int8x16_t a) { - // CHECK-LABEL: test_vmaxvq_s8 return vmaxvq_s8(a); - // CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vmaxvq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16 +// CHECK: ret i16 [[TMP2]] int16_t test_vmaxvq_s16(int16x8_t a) { - // CHECK-LABEL: test_vmaxvq_s16 return vmaxvq_s16(a); - // CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h } +// CHECK-LABEL: define i32 @test_vmaxvq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2 +// CHECK: ret i32 [[VMAXVQ_S32_I]] int32_t test_vmaxvq_s32(int32x4_t a) { - // CHECK-LABEL: test_vmaxvq_s32 return vmaxvq_s32(a); - // CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s } +// CHECK-LABEL: define i8 @test_vmaxvq_u8(<16 x i8> %a) #0 { +// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 +// CHECK: ret i8 [[TMP0]] uint8_t test_vmaxvq_u8(uint8x16_t a) { - // CHECK-LABEL: test_vmaxvq_u8 return vmaxvq_u8(a); - // CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vmaxvq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16 +// CHECK: ret i16 [[TMP2]] uint16_t test_vmaxvq_u16(uint16x8_t a) { - // CHECK-LABEL: test_vmaxvq_u16 return vmaxvq_u16(a); - // CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h } +// CHECK-LABEL: define i32 @test_vmaxvq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2 +// CHECK: ret i32 [[VMAXVQ_U32_I]] uint32_t test_vmaxvq_u32(uint32x4_t a) { - // CHECK-LABEL: test_vmaxvq_u32 return vmaxvq_u32(a); - // CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s } +// CHECK-LABEL: define i8 @test_vminv_s8(<8 x i8> %a) #0 { +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 +// CHECK: ret i8 [[TMP0]] int8_t test_vminv_s8(int8x8_t a) { - // CHECK-LABEL: test_vminv_s8 return vminv_s8(a); - // CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define i16 @test_vminv_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16 +// CHECK: ret i16 [[TMP2]] int16_t test_vminv_s16(int16x4_t a) { - // CHECK-LABEL: test_vminv_s16 return vminv_s16(a); - // CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.4h } +// CHECK-LABEL: define i8 @test_vminv_u8(<8 x i8> %a) #0 { +// 
CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 +// CHECK: ret i8 [[TMP0]] uint8_t test_vminv_u8(uint8x8_t a) { - // CHECK-LABEL: test_vminv_u8 return vminv_u8(a); - // CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define i16 @test_vminv_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16 +// CHECK: ret i16 [[TMP2]] uint16_t test_vminv_u16(uint16x4_t a) { - // CHECK-LABEL: test_vminv_u16 return vminv_u16(a); - // CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.4h } +// CHECK-LABEL: define i8 @test_vminvq_s8(<16 x i8> %a) #0 { +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 +// CHECK: ret i8 [[TMP0]] int8_t test_vminvq_s8(int8x16_t a) { - // CHECK-LABEL: test_vminvq_s8 return vminvq_s8(a); - // CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vminvq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16 +// CHECK: ret i16 [[TMP2]] int16_t test_vminvq_s16(int16x8_t a) { - // CHECK-LABEL: test_vminvq_s16 return vminvq_s16(a); - // CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h } +// CHECK-LABEL: define i32 @test_vminvq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[TMP1]]) #2 +// CHECK: ret i32 [[VMINVQ_S32_I]] int32_t test_vminvq_s32(int32x4_t a) { - // CHECK-LABEL: test_vminvq_s32 return vminvq_s32(a); - // CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s } +// CHECK-LABEL: define i8 @test_vminvq_u8(<16 x i8> %a) #0 { +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 +// CHECK: ret i8 [[TMP0]] uint8_t test_vminvq_u8(uint8x16_t a) { - // CHECK-LABEL: test_vminvq_u8 return vminvq_u8(a); - // CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vminvq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16 +// CHECK: ret i16 [[TMP2]] uint16_t test_vminvq_u16(uint16x8_t a) { - // CHECK-LABEL: test_vminvq_u16 return vminvq_u16(a); - // CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h } +// CHECK-LABEL: define i32 @test_vminvq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[TMP1]]) #2 +// CHECK: ret i32 [[VMINVQ_U32_I]] uint32_t test_vminvq_u32(uint32x4_t a) { - // CHECK-LABEL: test_vminvq_u32 return vminvq_u32(a); - // CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s } +// CHECK-LABEL: define i8 
@test_vaddv_s8(<8 x i8> %a) #0 { +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 +// CHECK: ret i8 [[TMP0]] int8_t test_vaddv_s8(int8x8_t a) { - // CHECK-LABEL: test_vaddv_s8 return vaddv_s8(a); - // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define i16 @test_vaddv_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16 +// CHECK: ret i16 [[TMP2]] int16_t test_vaddv_s16(int16x4_t a) { - // CHECK-LABEL: test_vaddv_s16 return vaddv_s16(a); - // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h } +// CHECK-LABEL: define i8 @test_vaddv_u8(<8 x i8> %a) #0 { +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 +// CHECK: ret i8 [[TMP0]] uint8_t test_vaddv_u8(uint8x8_t a) { - // CHECK-LABEL: test_vaddv_u8 return vaddv_u8(a); - // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define i16 @test_vaddv_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16 +// CHECK: ret i16 [[TMP2]] uint16_t test_vaddv_u16(uint16x4_t a) { - // CHECK-LABEL: test_vaddv_u16 return vaddv_u16(a); - // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h } +// CHECK-LABEL: define i8 @test_vaddvq_s8(<16 x i8> %a) #0 { +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 +// CHECK: ret i8 [[TMP0]] int8_t test_vaddvq_s8(int8x16_t a) { - // CHECK-LABEL: test_vaddvq_s8 return vaddvq_s8(a); - // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vaddvq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16 +// CHECK: ret i16 [[TMP2]] int16_t test_vaddvq_s16(int16x8_t a) { - // CHECK-LABEL: test_vaddvq_s16 return vaddvq_s16(a); - // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h } +// CHECK-LABEL: define i32 @test_vaddvq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[TMP1]]) #2 +// CHECK: ret i32 [[VADDVQ_S32_I]] int32_t test_vaddvq_s32(int32x4_t a) { - // CHECK-LABEL: test_vaddvq_s32 return vaddvq_s32(a); - // CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s } +// CHECK-LABEL: define i8 @test_vaddvq_u8(<16 x i8> %a) #0 { +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 +// CHECK: ret i8 [[TMP0]] uint8_t test_vaddvq_u8(uint8x16_t a) { - // CHECK-LABEL: test_vaddvq_u8 return vaddvq_u8(a); - // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vaddvq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to 
<16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK: ret i16 [[TMP2]]
 uint16_t test_vaddvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vaddvq_u16
   return vaddvq_u16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }

+// CHECK-LABEL: define i32 @test_vaddvq_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: ret i32 [[VADDVQ_U32_I]]
 uint32_t test_vaddvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vaddvq_u32
   return vaddvq_u32(a);
-  // CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define float @test_vmaxvq_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: ret float [[VMAXVQ_F32_I]]
 float32_t test_vmaxvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_f32
   return vmaxvq_f32(a);
-  // CHECK: fmaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define float @test_vminvq_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: ret float [[VMINVQ_F32_I]]
 float32_t test_vminvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vminvq_f32
   return vminvq_f32(a);
-  // CHECK: fminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define float @test_vmaxnmvq_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: ret float [[VMAXNMVQ_F32_I]]
 float32_t test_vmaxnmvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vmaxnmvq_f32
   return vmaxnmvq_f32(a);
-  // CHECK: fmaxnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define float @test_vminnmvq_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: ret float [[VMINNMVQ_F32_I]]
 float32_t test_vminnmvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vminnmvq_f32
   return vminnmvq_f32(a);
-  // CHECK: fminnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
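// Every across-lanes test above has the same shape: the intrinsic becomes a
// call to an llvm.aarch64.neon.*v reduction returning i32 (i64 or float for
// the wide and floating-point forms), and results narrower than i32 are cut
// down with a trunc. Scalar reference semantics for vaddlv_s8, as a sketch
// (addlv_s8_reference is a hypothetical helper, not part of the tests):

#include <arm_neon.h>

int16_t addlv_s8_reference(int8x8_t a) {
  int8_t lanes[8];
  vst1_s8(lanes, a);       // spill the lanes so we can index them
  int32_t sum = 0;         // i32 accumulator, like the intrinsic's result
  for (int i = 0; i < 8; ++i)
    sum += lanes[i];       // each i8 lane widens before the add
  return (int16_t)sum;     // the trunc that the CHECK lines match
}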
Index: test/CodeGen/aarch64-neon-extract.c
===================================================================
--- test/CodeGen/aarch64-neon-extract.c
+++ test/CodeGen/aarch64-neon-extract.c
@@ -1,148 +1,247 @@
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

 // Test new aarch64 intrinsics and types

 #include <arm_neon.h>

+// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK: ret <8 x i8> [[VEXT]]
 int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vext_s8
   return vext_s8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }

+// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK: ret <4 x i16> [[VEXT]]
 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vext_s16
   return vext_s16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }

+// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK: ret <2 x i32> [[VEXT]]
 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vext_s32
   return vext_s32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }

+// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x i64> [[VEXT]]
 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vext_s64
   return vext_s64(a, b, 0);
 }

+// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK: ret <16 x i8> [[VEXT]]
 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vextq_s8
   return vextq_s8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }

+// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK: ret <8 x i16> [[VEXT]]
 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vextq_s16
   return vextq_s16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }

+// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK: ret <4 x i32> [[VEXT]]
 int32x4_t
test_vextq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vextq_s32 return vextq_s32(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}} } +// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i64> [[VEXT]] int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vextq_s64 return vextq_s64(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}} } +// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[VEXT]] uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vext_u8 return vext_u8(a, b, 2); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}} } +// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i16> [[VEXT]] uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vext_u16 return vext_u16(a, b, 3); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}} } +// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i32> [[VEXT]] uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vext_u32 return vext_u32(a, b, 1); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}} } +// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[VEXT]] uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vext_u64 return vext_u64(a, b, 0); } +// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[VEXT]] uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vextq_u8 return vextq_u8(a, b, 2); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}} } +// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 
{ +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK: ret <8 x i16> [[VEXT]] uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vextq_u16 return vextq_u16(a, b, 3); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}} } +// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i32> [[VEXT]] uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vextq_u32 return vextq_u32(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}} } +// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i64> [[VEXT]] uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vextq_u64 return vextq_u64(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}} } +// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> +// CHECK: ret <2 x float> [[VEXT]] float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vext_f32 return vext_f32(a, b, 1); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}} } +// CHECK-LABEL: define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VEXT:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x double> [[VEXT]] float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vext_f64 return vext_f64(a, b, 0); } +// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VEXT:%.*]] = 
shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +// CHECK: ret <4 x float> [[VEXT]] float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vextq_f32 return vextq_f32(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}} } +// CHECK-LABEL: define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> +// CHECK: ret <2 x double> [[VEXT]] float64x2_t test_vextq_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vextq_f64 return vextq_f64(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}} } +// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[VEXT]] poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vext_p8 return vext_p8(a, b, 2); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}} } +// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i16> [[VEXT]] poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vext_p16 return vext_p16(a, b, 3); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}} } +// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[VEXT]] poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vextq_p8 return vextq_p8(a, b, 2); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}} } +// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK: ret <8 x i16> [[VEXT]] poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vextq_p16 return vextq_p16(a, b, 3); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}} } Index: test/CodeGen/aarch64-neon-fcvt-intrinsics.c =================================================================== --- test/CodeGen/aarch64-neon-fcvt-intrinsics.c +++ test/CodeGen/aarch64-neon-fcvt-intrinsics.c @@ -1,133 +1,153 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics 
and types #include +// CHECK-LABEL: define float @test_vcvtxd_f32_f64(double %a) #0 { +// CHECK: [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double %a) #2 +// CHECK: ret float [[VCVTXD_F32_F64_I]] float32_t test_vcvtxd_f32_f64(float64_t a) { -// CHECK-LABEL: test_vcvtxd_f32_f64 -// CHECK: fcvtxn {{s[0-9]+}}, {{d[0-9]+}} return (float32_t)vcvtxd_f32_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtas_s32_f32(float %a) #0 { +// CHECK: [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTAS_S32_F32_I]] int32_t test_vcvtas_s32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtas_s32_f32 -// CHECK: fcvtas {{[ws][0-9]+}}, {{s[0-9]+}} return (int32_t)vcvtas_s32_f32(a); } +// CHECK-LABEL: define i64 @test_test_vcvtad_s64_f64(double %a) #0 { +// CHECK: [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTAD_S64_F64_I]] int64_t test_test_vcvtad_s64_f64(float64_t a) { -// CHECK-LABEL: test_test_vcvtad_s64_f64 -// CHECK: fcvtas {{[dx][0-9]+}}, {{d[0-9]+}} return (int64_t)vcvtad_s64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtas_u32_f32(float %a) #0 { +// CHECK: [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTAS_U32_F32_I]] uint32_t test_vcvtas_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtas_u32_f32 -// CHECK: fcvtau {{[ws][0-9]+}}, {{s[0-9]+}} return (uint32_t)vcvtas_u32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtad_u64_f64(double %a) #0 { +// CHECK: [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTAD_U64_F64_I]] uint64_t test_vcvtad_u64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtad_u64_f64 -// CHECK: fcvtau {{[xd][0-9]+}}, {{d[0-9]+}} return (uint64_t)vcvtad_u64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtms_s32_f32(float %a) #0 { +// CHECK: [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTMS_S32_F32_I]] int32_t test_vcvtms_s32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtms_s32_f32 -// CHECK: fcvtms {{[sw][0-9]+}}, {{s[0-9]+}} return (int32_t)vcvtms_s32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtmd_s64_f64(double %a) #0 { +// CHECK: [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTMD_S64_F64_I]] int64_t test_vcvtmd_s64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtmd_s64_f64 -// CHECK: fcvtms {{[dx][0-9]+}}, {{d[0-9]+}} return (int64_t)vcvtmd_s64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtms_u32_f32(float %a) #0 { +// CHECK: [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTMS_U32_F32_I]] uint32_t test_vcvtms_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtms_u32_f32 -// CHECK: fcvtmu {{[ws][0-9]+}}, {{s[0-9]+}} return (uint32_t)vcvtms_u32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtmd_u64_f64(double %a) #0 { +// CHECK: [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTMD_U64_F64_I]] uint64_t test_vcvtmd_u64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtmd_u64_f64 -// CHECK: fcvtmu {{[xd][0-9]+}}, {{d[0-9]+}} return (uint64_t)vcvtmd_u64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtns_s32_f32(float %a) #0 { +// CHECK: [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTNS_S32_F32_I]] int32_t test_vcvtns_s32_f32(float32_t a) { -// 
CHECK-LABEL: test_vcvtns_s32_f32 -// CHECK: fcvtns {{[sw][0-9]+}}, {{s[0-9]+}} return (int32_t)vcvtns_s32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtnd_s64_f64(double %a) #0 { +// CHECK: [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTND_S64_F64_I]] int64_t test_vcvtnd_s64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtnd_s64_f64 -// CHECK: fcvtns {{[dx][0-9]+}}, {{d[0-9]+}} return (int64_t)vcvtnd_s64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtns_u32_f32(float %a) #0 { +// CHECK: [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTNS_U32_F32_I]] uint32_t test_vcvtns_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtns_u32_f32 -// CHECK: fcvtnu {{[sw][0-9]+}}, {{s[0-9]+}} return (uint32_t)vcvtns_u32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtnd_u64_f64(double %a) #0 { +// CHECK: [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTND_U64_F64_I]] uint64_t test_vcvtnd_u64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtnd_u64_f64 -// CHECK: fcvtnu {{[dx][0-9]+}}, {{d[0-9]+}} return (uint64_t)vcvtnd_u64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtps_s32_f32(float %a) #0 { +// CHECK: [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTPS_S32_F32_I]] int32_t test_vcvtps_s32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtps_s32_f32 -// CHECK: fcvtps {{[sw][0-9]+}}, {{s[0-9]+}} return (int32_t)vcvtps_s32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtpd_s64_f64(double %a) #0 { +// CHECK: [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTPD_S64_F64_I]] int64_t test_vcvtpd_s64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtpd_s64_f64 -// CHECK: fcvtps {{[dx][0-9]+}}, {{d[0-9]+}} return (int64_t)vcvtpd_s64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtps_u32_f32(float %a) #0 { +// CHECK: [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTPS_U32_F32_I]] uint32_t test_vcvtps_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtps_u32_f32 -// CHECK: fcvtpu {{[sw][0-9]+}}, {{s[0-9]+}} return (uint32_t)vcvtps_u32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtpd_u64_f64(double %a) #0 { +// CHECK: [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTPD_U64_F64_I]] uint64_t test_vcvtpd_u64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtpd_u64_f64 -// CHECK: fcvtpu {{[dx][0-9]+}}, {{d[0-9]+}} return (uint64_t)vcvtpd_u64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvts_s32_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fptosi float %a to i32 +// CHECK: ret i32 [[TMP0]] int32_t test_vcvts_s32_f32(float32_t a) { -// CHECK-LABEL: test_vcvts_s32_f32 -// CHECK: fcvtzs {{[sw][0-9]+}}, {{s[0-9]+}} return (int32_t)vcvts_s32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtd_s64_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fptosi double %a to i64 +// CHECK: ret i64 [[TMP0]] int64_t test_vcvtd_s64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtd_s64_f64 -// CHECK: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}} return (int64_t)vcvtd_s64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvts_u32_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fptoui float %a to i32 +// CHECK: ret i32 [[TMP0]] uint32_t test_vcvts_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvts_u32_f32 -// CHECK: fcvtzu {{[sw][0-9]+}}, {{s[0-9]+}} return 
(uint32_t)vcvts_u32_f32(a);
 }

+// CHECK-LABEL: define i64 @test_vcvtd_u64_f64(double %a) #0 {
+// CHECK: [[TMP0:%.*]] = fptoui double %a to i64
+// CHECK: ret i64 [[TMP0]]
 uint64_t test_vcvtd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_u64_f64
-// CHECK: fcvtzu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtd_u64_f64(a);
 }
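// Of the scalar conversions above, only the truncating vcvts/vcvtd forms
// lower to plain fptosi/fptoui casts; the rounding variants (fcvtas, fcvtms,
// fcvtns, fcvtps) keep target intrinsics, since a C cast only expresses
// round-toward-zero. Equivalent plain-C view of the fptosi test, as a sketch
// (cvt_s32_reference is a hypothetical helper, not part of the tests):

#include <arm_neon.h>

int32_t cvt_s32_reference(float32_t a) {
  return (int32_t)a; // fptosi float %a to i32: truncation toward zero
}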
float> %a, <4 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3 +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]] +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { - // CHECK-LABEL: test_vmlsq_n_f32 return vmlsq_n_f32(a, b, c); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]] +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) { - // CHECK-LABEL: test_vmls_n_f32 return vmls_n_f32(a, b, c); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x double> @test_vmlsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %a, [[MUL_I]] +// CHECK: ret <2 x double> [[SUB_I]] float64x2_t test_vmlsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { - // CHECK-LABEL: test_vmlsq_n_f64 return vmlsq_n_f64(a, b, c); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] - // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - // CHECK-FMA: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] - // CHECK-FMA: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vmla_lane_f32_0 return vmla_lane_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> 
%b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vmlaq_lane_f32_0 return vmlaq_lane_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vmla_laneq_f32_0 return vmla_laneq_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vmlaq_laneq_f32_0 return vmlaq_laneq_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vmls_lane_f32_0 return vmls_lane_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vmlsq_lane_f32_0 return vmlsq_lane_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer +// 
CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vmls_laneq_f32_0 return vmls_laneq_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vmlsq_laneq_f32_0 return vmlsq_laneq_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vmla_lane_f32 return vmla_lane_f32(a, b, v, 1); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] - // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vmlaq_lane_f32 return vmlaq_lane_f32(a, b, v, 1); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] - // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vmla_laneq_f32 return vmla_laneq_f32(a, b, v, 3); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] - // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, 
[[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vmlaq_laneq_f32 return vmlaq_laneq_f32(a, b, v, 3); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] - // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vmls_lane_f32 return vmls_lane_f32(a, b, v, 1); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vmlsq_lane_f32 return vmlsq_lane_f32(a, b, v, 1); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vmls_laneq_f32 return vmls_laneq_f32(a, b, v, 3); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vmlsq_laneq_f32 return vmlsq_laneq_f32(a, b, v, 3); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <2 x double> @test_vfmaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast 
<2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #2 +// CHECK: ret <2 x double> [[TMP6]] float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { - // CHECK-LABEL: test_vfmaq_n_f64: return vfmaq_n_f64(a, b, c); - // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}} } +// CHECK-LABEL: define <2 x double> @test_vfmsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP4:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[FMLS1_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[FMLS2_I_I:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLS_I_I]], <2 x double> [[TMP4]], <2 x double> [[FMLS1_I_I]]) #2 +// CHECK: ret <2 x double> [[FMLS2_I_I]] float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { - // CHECK-LABEL: test_vfmsq_n_f64: return vfmsq_n_f64(a, b, c); - // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}} } Index: test/CodeGen/aarch64-neon-intrinsics.c =================================================================== --- test/CodeGen/aarch64-neon-intrinsics.c +++ test/CodeGen/aarch64-neon-intrinsics.c @@ -1,334 +1,394 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +// RUN: -fallow-half-arguments-and-returns -ffp-contract=fast -S -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg \ +// RUN: | FileCheck %s // Test new aarch64 intrinsics and types #include <arm_neon.h> +// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vadd_s8 return vadd_s8(v1, v2); - // CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vadd_s16 return vadd_s16(v1, v2); - // CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vadd_s32 return vadd_s32(v1, v2); - // CHECK: add {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %v1, <1 x i64> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2 +// CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vadd_s64(int64x1_t v1, int64x1_t v2) { - // CHECK-LABEL: test_vadd_s64 return vadd_s64(v1, v2); - // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2 +// CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vadd_f32 return vadd_f32(v1, v2); - // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vadd_u8 return vadd_u8(v1, v2); - // CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vadd_u16 return vadd_u16(v1, v2); - // CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vadd_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vadd_u32 return vadd_u32(v1, v2); - // CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %v1, <1 x i64> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2 +// CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vadd_u64(uint64x1_t v1, uint64x1_t v2) { - // CHECK-LABEL: test_vadd_u64 return vadd_u64(v1, v2); - // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vaddq_s8 return vaddq_s8(v1, v2); - // CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vaddq_s16 return vaddq_s16(v1, v2); - // CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddq_s32(int32x4_t v1,int32x4_t v2) { - // CHECK-LABEL: test_vaddq_s32 return vaddq_s32(v1, v2); - // CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2 +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vaddq_s64 return vaddq_s64(v1, v2); - // CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x 
float> %v1, <4 x float> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2 +// CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vaddq_f32 return vaddq_f32(v1, v2); - // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vaddq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, %v2 +// CHECK: ret <2 x double> [[ADD_I]] float64x2_t test_vaddq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vaddq_f64 return vaddq_f64(v1, v2); - // CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vaddq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vaddq_u8 return vaddq_u8(v1, v2); - // CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vaddq_u16 return vaddq_u16(v1, v2); - // CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK: vaddq_u32 return vaddq_u32(v1, v2); - // CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2 +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vaddq_u64 return vaddq_u64(v1, v2); - // CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[SUB_I]] int8x8_t test_vsub_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vsub_s8 return vsub_s8(v1, v2); - // CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vsub_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vsub_s16 return vsub_s16(v1, v2); - // CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vsub_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vsub_s32 return vsub_s32(v1, v2); - // CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %v1, <1 x i64> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2 +// CHECK: ret <1 x i64> [[SUB_I]] int64x1_t test_vsub_s64(int64x1_t v1, int64x1_t v2) { - // CHECK-LABEL: test_vsub_s64 return vsub_s64(v1, v2); - // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = 
fsub <2 x float> %v1, %v2 +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vsub_f32 return vsub_f32(v1, v2); - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[SUB_I]] uint8x8_t test_vsub_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vsub_u8 return vsub_u8(v1, v2); - // CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vsub_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vsub_u16 return vsub_u16(v1, v2); - // CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vsub_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vsub_u32 return vsub_u32(v1, v2); - // CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %v1, <1 x i64> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2 +// CHECK: ret <1 x i64> [[SUB_I]] uint64x1_t test_vsub_u64(uint64x1_t v1, uint64x1_t v2) { - // CHECK-LABEL: test_vsub_u64 return vsub_u64(v1, v2); - // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vsubq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vsubq_s8 return vsubq_s8(v1, v2); - // CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vsubq_s16 return vsubq_s16(v1, v2); - // CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubq_s32(int32x4_t v1,int32x4_t v2) { - // CHECK-LABEL: test_vsubq_s32 return vsubq_s32(v1, v2); - // CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2 +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vsubq_s64 return vsubq_s64(v1, v2); - // CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2 +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vsubq_f32 return vsubq_f32(v1, v2); - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vsubq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2 +// CHECK: ret <2 x double> 
[[SUB_I]] float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vsubq_f64 return vsubq_f64(v1, v2); - // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[SUB_I]] uint8x16_t test_vsubq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vsubq_u8 return vsubq_u8(v1, v2); - // CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vsubq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vsubq_u16 return vsubq_u16(v1, v2); - // CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK: vsubq_u32 return vsubq_u32(v1, v2); - // CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2 +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vsubq_u64 return vsubq_u64(v1, v2); - // CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[MUL_I]] int8x8_t test_vmul_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vmul_s8 return vmul_s8(v1, v2); - // CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[MUL_I]] int16x4_t test_vmul_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vmul_s16 return vmul_s16(v1, v2); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[MUL_I]] int32x2_t test_vmul_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vmul_s32 return vmul_s32(v1, v2); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2 +// CHECK: ret <2 x float> [[MUL_I]] float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vmul_f32 return vmul_f32(v1, v2); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[MUL_I]] uint8x8_t test_vmul_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vmul_u8 return vmul_u8(v1, v2); - // CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[MUL_I]] uint16x4_t test_vmul_u16(uint16x4_t v1, uint16x4_t v2) { - // 
CHECK-LABEL: test_vmul_u16 return vmul_u16(v1, v2); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[MUL_I]] uint32x2_t test_vmul_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vmul_u32 return vmul_u32(v1, v2); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[MUL_I]] int8x16_t test_vmulq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vmulq_s8 return vmulq_s8(v1, v2); - // CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[MUL_I]] int16x8_t test_vmulq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vmulq_s16 return vmulq_s16(v1, v2); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[MUL_I]] int32x4_t test_vmulq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vmulq_s32 return vmulq_s32(v1, v2); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[MUL_I]] uint8x16_t test_vmulq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vmulq_u8 return vmulq_u8(v1, v2); - // CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[MUL_I]] uint16x8_t test_vmulq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vmulq_u16 return vmulq_u16(v1, v2); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[MUL_I]] uint32x4_t test_vmulq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vmulq_u32 return vmulq_u32(v1, v2); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2 +// CHECK: ret <4 x float> [[MUL_I]] float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vmulq_f32 return vmulq_f32(v1, v2); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmulq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2 +// CHECK: ret <2 x double> [[MUL_I]] float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vmulq_f64 return vmulq_f64(v1, v2); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VMUL_V_I]] poly8x8_t 
test_vmul_p8(poly8x8_t v1, poly8x8_t v2) { // test_vmul_p8 return vmul_p8(v1, v2); // pmul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 +// CHECK: ret <16 x i8> [[VMULQ_V_I]] poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) { // test_vmulq_p8 return vmulq_p8(v1, v2); @@ -336,1295 +396,2132 @@ } +// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]] +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vmla_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) { - // CHECK-LABEL: test_vmla_s8 return vmla_s8(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vmla_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vmla_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) { - // CHECK-LABEL: test_vmla_s16 return vmla_s16(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vmla_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) { - // CHECK-LABEL: test_vmla_s32 return vmla_s32(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]] +// CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { - // CHECK-LABEL: test_vmla_f32 return vmla_f32(v1, v2, v3); - // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]] +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vmla_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { - // CHECK-LABEL: test_vmla_u8 return vmla_u8(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vmla_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { - // CHECK-LABEL: test_vmla_u16 return vmla_u16(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vmla_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { - // CHECK-LABEL: test_vmla_u32 return vmla_u32(v1, v2, v3); - // CHECK: 
mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]] +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vmlaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) { - // CHECK-LABEL: test_vmlaq_s8 return vmlaq_s8(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) { - // CHECK-LABEL: test_vmlaq_s16 return vmlaq_s16(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) { - // CHECK-LABEL: test_vmlaq_s32 return vmlaq_s32(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]] +// CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { - // CHECK-LABEL: test_vmlaq_f32 return vmlaq_f32(v1, v2, v3); - // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]] +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vmlaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { - // CHECK-LABEL: test_vmlaq_u8 return vmlaq_u8(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { - // CHECK-LABEL: test_vmlaq_u16 return vmlaq_u16(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { - // CHECK-LABEL: test_vmlaq_u32 return vmlaq_u32(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmlaq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]] +// CHECK: ret <2 x double> [[ADD_I]] float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { - // CHECK-LABEL: test_vmlaq_f64 return vmlaq_f64(v1, v2, 
v3); - // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]] +// CHECK: ret <8 x i8> [[SUB_I]] int8x8_t test_vmls_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) { - // CHECK-LABEL: test_vmls_s8 return vmls_s8(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vmls_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vmls_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) { - // CHECK-LABEL: test_vmls_s16 return vmls_s16(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vmls_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) { - // CHECK-LABEL: test_vmls_s32 return vmls_s32(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]] +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { - // CHECK-LABEL: test_vmls_f32 return vmls_f32(v1, v2, v3); - // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]] +// CHECK: ret <8 x i8> [[SUB_I]] uint8x8_t test_vmls_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { - // CHECK-LABEL: test_vmls_u8 return vmls_u8(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vmls_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { - // CHECK-LABEL: test_vmls_u16 return vmls_u16(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vmls_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { - // CHECK-LABEL: test_vmls_u32 return vmls_u32(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]] +// CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vmlsq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) { - // CHECK-LABEL: test_vmlsq_s8 return vmlsq_s8(v1, v2, v3); - // 
CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) { - // CHECK-LABEL: test_vmlsq_s16 return vmlsq_s16(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) { - // CHECK-LABEL: test_vmlsq_s32 return vmlsq_s32(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]] +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { - // CHECK-LABEL: test_vmlsq_f32 return vmlsq_f32(v1, v2, v3); - // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]] +// CHECK: ret <16 x i8> [[SUB_I]] uint8x16_t test_vmlsq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { - // CHECK-LABEL: test_vmlsq_u8 return vmlsq_u8(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { - // CHECK-LABEL: test_vmlsq_u16 return vmlsq_u16(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { - // CHECK-LABEL: test_vmlsq_u32 return vmlsq_u32(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmlsq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]] +// CHECK: ret <2 x double> [[SUB_I]] float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { - // CHECK-LABEL: test_vmlsq_f64 return vmlsq_f64(v1, v2, v3); - // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x 
float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4 +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { - // CHECK-LABEL: test_vfma_f32 return vfma_f32(v1, v2, v3); - // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4 +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { - // CHECK-LABEL: test_vfmaq_f32 return vfmaq_f32(v1, v2, v3); - // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vfmaq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #4 +// CHECK: ret <2 x double> [[TMP6]] float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { - // CHECK-LABEL: test_vfmaq_f64 return vfmaq_f64(v1, v2, v3); - // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP4:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[FMLS1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[FMLS2_I:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLS_I]], <2 x float> [[TMP4]], <2 x float> [[FMLS1_I]]) #4 +// CHECK: ret <2 x float> [[FMLS2_I]] float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { - // CHECK-LABEL: test_vfms_f32 return vfms_f32(v1, v2, v3); - // CHECK: fmls v0.2s, {{v1.2s, v2.2s|v2.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// 
CHECK: [[TMP4:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[FMLS1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[FMLS2_I:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLS_I]], <4 x float> [[TMP4]], <4 x float> [[FMLS1_I]]) #4 +// CHECK: ret <4 x float> [[FMLS2_I]] float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { - // CHECK-LABEL: test_vfmsq_f32 return vfmsq_f32(v1, v2, v3); - // CHECK: fmls v0.4s, {{v1.4s, v2.4s|v2.4s, v1.4s}} } +// CHECK-LABEL: define <2 x double> @test_vfmsq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP4:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[FMLS1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[FMLS2_I:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLS_I]], <2 x double> [[TMP4]], <2 x double> [[FMLS1_I]]) #4 +// CHECK: ret <2 x double> [[FMLS2_I]] float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { - // CHECK: vfmsq_f64 return vfmsq_f64(v1, v2, v3); - // CHECK: fmls v0.2d, {{v1.2d, v2.2d|v2.2d, v1.2d}} } +// CHECK-LABEL: define <2 x double> @test_vdivq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2 +// CHECK: ret <2 x double> [[DIV_I]] float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vdivq_f64 return vdivq_f64(v1, v2); - // CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vdivq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2 +// CHECK: ret <4 x float> [[DIV_I]] float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vdivq_f32 return vdivq_f32(v1, v2); - // CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vdiv_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2 +// CHECK: ret <2 x float> [[DIV_I]] float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vdiv_f32 return vdiv_f32(v1, v2); - // CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]] +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) { - // CHECK-LABEL: test_vaba_s8 return vaba_s8(v1, v2, v3); - // CHECK: saba {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> 
[[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]] +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) { - // CHECK-LABEL: test_vaba_s16 return vaba_s16(v1, v2, v3); - // CHECK: saba {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) { - // CHECK-LABEL: test_vaba_s32 return vaba_s32(v1, v2, v3); - // CHECK: saba {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]] +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { - // CHECK-LABEL: test_vaba_u8 return vaba_u8(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { - // CHECK-LABEL: test_vaba_u16 return vaba_u16(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { - // CHECK-LABEL: test_vaba_u32 return vaba_u32(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]] +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) { - // CHECK-LABEL: test_vabaq_s8 return vabaq_s8(v1, v2, 
v3); - // CHECK: saba {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) { - // CHECK-LABEL: test_vabaq_s16 return vabaq_s16(v1, v2, v3); - // CHECK: saba {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) { - // CHECK-LABEL: test_vabaq_s32 return vabaq_s32(v1, v2, v3); - // CHECK: saba {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]] +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { - // CHECK-LABEL: test_vabaq_u8 return vabaq_u8(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { - // CHECK-LABEL: test_vabaq_u16 return vabaq_u16(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t 
test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { - // CHECK-LABEL: test_vabaq_u32 return vabaq_u32(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VABD_I]] int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vabd_s8 return vabd_s8(v1, v2); - // CHECK: sabd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4 +// CHECK: ret <4 x i16> [[VABD2_I]] int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vabd_s16 return vabd_s16(v1, v2); - // CHECK: sabd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4 +// CHECK: ret <2 x i32> [[VABD2_I]] int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vabd_s32 return vabd_s32(v1, v2); - // CHECK: sabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VABD_I]] uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vabd_u8 return vabd_u8(v1, v2); - // CHECK: uabd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4 +// CHECK: ret <4 x i16> [[VABD2_I]] uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vabd_u16 return vabd_u16(v1, v2); - // CHECK: uabd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4 +// CHECK: ret <2 x i32> [[VABD2_I]] uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) { - 
// CHECK-LABEL: test_vabd_u32 return vabd_u32(v1, v2); - // CHECK: uabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]]) #4 +// CHECK: ret <2 x float> [[VABD2_I]] float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vabd_f32 return vabd_f32(v1, v2); - // CHECK: fabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 +// CHECK: ret <16 x i8> [[VABD_I]] int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vabdq_s8 return vabdq_s8(v1, v2); - // CHECK: sabd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4 +// CHECK: ret <8 x i16> [[VABD2_I]] int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vabdq_s16 return vabdq_s16(v1, v2); - // CHECK: sabd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4 +// CHECK: ret <4 x i32> [[VABD2_I]] int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vabdq_s32 return vabdq_s32(v1, v2); - // CHECK: sabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 +// CHECK: ret <16 x i8> [[VABD_I]] uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vabdq_u8 return vabdq_u8(v1, v2); - // CHECK: uabd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4 +// CHECK: ret <8 x i16> [[VABD2_I]] uint16x8_t test_vabdq_u16(uint16x8_t v1, 
uint16x8_t v2) { - // CHECK-LABEL: test_vabdq_u16 return vabdq_u16(v1, v2); - // CHECK: uabd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4 +// CHECK: ret <4 x i32> [[VABD2_I]] uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vabdq_u32 return vabdq_u32(v1, v2); - // CHECK: uabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]]) #4 +// CHECK: ret <4 x float> [[VABD2_I]] float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vabdq_f32 return vabdq_f32(v1, v2); - // CHECK: fabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vabdq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> [[VABD_I]], <2 x double> [[VABD1_I]]) #4 +// CHECK: ret <2 x double> [[VABD2_I]] float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vabdq_f64 return vabdq_f64(v1, v2); - // CHECK: fabd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) { - // CHECK-LABEL: test_vbsl_s8 return vbsl_s8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] =
bitcast <4 x i16> [[VBSL5_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[TMP4]] int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) { - // CHECK-LABEL: test_vbsl_s16 return vbsl_s16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i32> [[VBSL5_I]] int32x2_t test_vbsl_s32(uint32x2_t v1, int32x2_t v2, int32x2_t v3) { - // CHECK-LABEL: test_vbsl_s32 return vbsl_s32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <1 x i64> [[VBSL5_I]] uint64x1_t test_vbsl_s64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) { - // CHECK-LABEL: test_vbsl_s64 return vbsl_s64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] uint8x8_t test_vbsl_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { - // CHECK-LABEL: test_vbsl_u8 return vbsl_u8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] uint16x4_t test_vbsl_u16(uint16x4_t v1,
uint16x4_t v2, uint16x4_t v3) { - // CHECK-LABEL: test_vbsl_u16 return vbsl_u16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i32> [[VBSL5_I]] uint32x2_t test_vbsl_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { - // CHECK-LABEL: test_vbsl_u32 return vbsl_u32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <1 x i64> [[VBSL5_I]] uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) { - // CHECK-LABEL: test_vbsl_u64 return vbsl_u64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <2 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x float> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP4:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP4]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP5]] float32x2_t test_vbsl_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { - // CHECK-LABEL: test_vbsl_f32 return vbsl_f32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8>
[[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double> +// CHECK: ret <1 x double> [[TMP4]] float64x1_t test_vbsl_f64(uint64x1_t v1, float64x1_t v2, float64x1_t v3) { - // CHECK-LABEL: test_vbsl_f64 return vbsl_f64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] poly8x8_t test_vbsl_p8(uint8x8_t v1, poly8x8_t v2, poly8x8_t v3) { - // CHECK-LABEL: test_vbsl_p8 return vbsl_p8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) { - // CHECK-LABEL: test_vbsl_p16 return vbsl_p16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) { - // CHECK-LABEL: test_vbslq_s8 return vbslq_s8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]]
int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) { - // CHECK-LABEL: test_vbslq_s16 return vbslq_s16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i32> [[VBSL5_I]] int32x4_t test_vbslq_s32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) { - // CHECK-LABEL: test_vbslq_s32 return vbslq_s32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i64> [[VBSL5_I]] int64x2_t test_vbslq_s64(uint64x2_t v1, int64x2_t v2, int64x2_t v3) { - // CHECK-LABEL: test_vbslq_s64 return vbslq_s64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] uint8x16_t test_vbslq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { - // CHECK-LABEL: test_vbslq_u8 return vbslq_u8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]] uint16x8_t test_vbslq_u16(uint16x8_t v1, uint16x8_t v2,
uint16x8_t v3) { - // CHECK-LABEL: test_vbslq_u16 return vbslq_u16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i32> [[VBSL5_I]] uint32x4_t test_vbslq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { - // CHECK-LABEL: test_vbslq_u32 return vbslq_u32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i64> [[VBSL5_I]] uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) { - // CHECK-LABEL: test_vbslq_u64 return vbslq_u64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP4]] float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) { - // CHECK-LABEL: test_vbslq_f32 return vbslq_f32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] poly8x16_t
test_vbslq_p8(uint8x16_t v1, poly8x16_t v2, poly8x16_t v3) { - // CHECK-LABEL: test_vbslq_p8 return vbslq_p8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]] poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) { - // CHECK-LABEL: test_vbslq_p16 return vbslq_p16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double> +// CHECK: ret <2 x double> [[TMP4]] float64x2_t test_vbslq_f64(uint64x2_t v1, float64x2_t v2, float64x2_t v3) { - // CHECK-LABEL: test_vbslq_f64 return vbslq_f64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4 +// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vrecps_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vrecps_f32 return vrecps_f32(v1, v2); - // CHECK: frecps {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float>
@llvm.aarch64.neon.frecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4 +// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vrecpsq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vrecpsq_f32 return vrecpsq_f32(v1, v2); - // CHECK: frecps {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrecpsq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> [[VRECPSQ_V_I]], <2 x double> [[VRECPSQ_V1_I]]) #4 +// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <2 x double> [[VRECPSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <2 x double> +// CHECK: ret <2 x double> [[TMP2]] float64x2_t test_vrecpsq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vrecpsq_f64 return vrecpsq_f64(v1, v2); - // CHECK: frecps {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4 +// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vrsqrts_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vrsqrts_f32 return vrsqrts_f32(v1, v2); - // CHECK: frsqrts {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4 +// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vrsqrtsq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vrsqrtsq_f32 return vrsqrtsq_f32(v1, v2); - // CHECK: frsqrts {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrsqrtsq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: 
[[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> [[VRSQRTSQ_V_I]], <2 x double> [[VRSQRTSQ_V1_I]]) #4 +// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <2 x double> [[VRSQRTSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <2 x double> +// CHECK: ret <2 x double> [[TMP2]] float64x2_t test_vrsqrtsq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vrsqrtsq_f64 return vrsqrtsq_f64(v1, v2); - // CHECK: frsqrts {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCAGE_V2_I]] uint32x2_t test_vcage_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vcage_f32 return vcage_f32(v1, v2); - // CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCAGE_V_I]], <1 x double> [[VCAGE_V1_I]]) #4 +// CHECK: ret <1 x i64> [[VCAGE_V2_I]] uint64x1_t test_vcage_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcage_f64 return vcage_f64(a, b); - // CHECK: facge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]] uint32x4_t test_vcageq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcageq_f32 return vcageq_f32(v1, v2); - // CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcageq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCAGEQ_V_I]], <2 x double> [[VCAGEQ_V1_I]]) #4 +// CHECK: ret <2 x i64> [[VCAGEQ_V2_I]] uint64x2_t test_vcageq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcageq_f64 return vcageq_f64(v1, v2); - // CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// 
CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCAGT_V2_I]] uint32x2_t test_vcagt_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vcagt_f32 return vcagt_f32(v1, v2); - // CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCAGT_V_I]], <1 x double> [[VCAGT_V1_I]]) #4 +// CHECK: ret <1 x i64> [[VCAGT_V2_I]] uint64x1_t test_vcagt_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcagt_f64 return vcagt_f64(a, b); - // CHECK: facgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]] uint32x4_t test_vcagtq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcagtq_f32 return vcagtq_f32(v1, v2); - // CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcagtq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCAGTQ_V_I]], <2 x double> [[VCAGTQ_V1_I]]) #4 +// CHECK: ret <2 x i64> [[VCAGTQ_V2_I]] uint64x2_t test_vcagtq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcagtq_f64 return vcagtq_f64(v1, v2); - // CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCALE_V2_I]] uint32x2_t test_vcale_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: 
test_vcale_f32 return vcale_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. - // CHECK: facge {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCALE_V_I]], <1 x double> [[VCALE_V1_I]]) #4 +// CHECK: ret <1 x i64> [[VCALE_V2_I]] uint64x1_t test_vcale_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcale_f64 return vcale_f64(a, b); - // CHECK: facge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCALEQ_V2_I]] uint32x4_t test_vcaleq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcaleq_f32 return vcaleq_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. - // CHECK: facge {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <2 x i64> @test_vcaleq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCALEQ_V_I]], <2 x double> [[VCALEQ_V1_I]]) #4 +// CHECK: ret <2 x i64> [[VCALEQ_V2_I]] uint64x2_t test_vcaleq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcaleq_f64 return vcaleq_f64(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. - // CHECK: facge {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCALT_V2_I]] uint32x2_t test_vcalt_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vcalt_f32 return vcalt_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd.
- // CHECK: facgt {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCALT_V_I]], <1 x double> [[VCALT_V1_I]]) #4 +// CHECK: ret <1 x i64> [[VCALT_V2_I]] uint64x1_t test_vcalt_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcalt_f64 return vcalt_f64(a, b); - // CHECK: facgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCALTQ_V2_I]] uint32x4_t test_vcaltq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcaltq_f32 return vcaltq_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. - // CHECK: facgt {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <2 x i64> @test_vcaltq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCALTQ_V_I]], <2 x double> [[VCALTQ_V1_I]]) #4 +// CHECK: ret <2 x i64> [[VCALTQ_V2_I]] uint64x2_t test_vcaltq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcaltq_f64 return vcaltq_f64(v1, v2); // Using registers other than v0, v1 is possible, but would be odd.
- // CHECK: facgt {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK: ret <8 x i8> [[VTST_I]] uint8x8_t test_vtst_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vtst_s8 return vtst_s8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK: ret <4 x i16> [[VTST_I]] uint16x4_t test_vtst_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vtst_s16 return vtst_s16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32> +// CHECK: ret <2 x i32> [[VTST_I]] uint32x2_t test_vtst_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vtst_s32 return vtst_s32(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK: ret <8 x i8> [[VTST_I]] uint8x8_t test_vtst_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vtst_u8 return vtst_u8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK: ret <4 x i16> [[VTST_I]] uint16x4_t test_vtst_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vtst_u16 return vtst_u16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// 
CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32> +// CHECK: ret <2 x i32> [[VTST_I]] uint32x2_t test_vtst_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vtst_u32 return vtst_u32(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK: ret <16 x i8> [[VTST_I]] uint8x16_t test_vtstq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vtstq_s8 return vtstq_s8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK: ret <8 x i16> [[VTST_I]] uint16x8_t test_vtstq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vtstq_s16 return vtstq_s16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +// CHECK: ret <4 x i32> [[VTST_I]] uint32x4_t test_vtstq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vtstq_s32 return vtstq_s32(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK: ret <16 x i8> [[VTST_I]] uint8x16_t test_vtstq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vtstq_u8 return vtstq_u8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK: ret <8 x i16> [[VTST_I]] uint16x8_t test_vtstq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vtstq_u16 return 
vtstq_u16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +// CHECK: ret <4 x i32> [[VTST_I]] uint32x4_t test_vtstq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vtstq_u32 return vtstq_u32(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vtstq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK: ret <2 x i64> [[VTST_I]] uint64x2_t test_vtstq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vtstq_s64 return vtstq_s64(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vtstq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK: ret <2 x i64> [[VTST_I]] uint64x2_t test_vtstq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vtstq_u64 return vtstq_u64(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK: ret <8 x i8> [[VTST_I]] uint8x8_t test_vtst_p8(poly8x8_t v1, poly8x8_t v2) { - // CHECK-LABEL: test_vtst_p8 return vtst_p8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK: ret <4 x i16> [[VTST_I]] uint16x4_t test_vtst_p16(poly16x4_t v1, poly16x4_t v2) { - // CHECK-LABEL: test_vtst_p16 return vtst_p16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } 
+// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK: ret <16 x i8> [[VTST_I]] uint8x16_t test_vtstq_p8(poly8x16_t v1, poly8x16_t v2) { - // CHECK-LABEL: test_vtstq_p8 return vtstq_p8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK: ret <8 x i16> [[VTST_I]] uint16x8_t test_vtstq_p16(poly16x8_t v1, poly16x8_t v2) { - // CHECK-LABEL: test_vtstq_p16 return vtstq_p16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <1 x i64> @test_vtst_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK: ret <1 x i64> [[VTST_I]] uint64x1_t test_vtst_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vtst_s64 return vtst_s64(a, b); - // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vtst_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK: ret <1 x i64> [[VTST_I]] uint64x1_t test_vtst_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vtst_u64 return vtst_u64(a, b); - // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vceq_s8 return vceq_s8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vceq_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vceq_s16 return vceq_s16(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> 
@test_vceq_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vceq_s32 return vceq_s32(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vceq_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vceq_s64 return vceq_s64(a, b); - // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vceq_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vceq_u64 return vceq_u64(a, b); - // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vceq_f32 return vceq_f32(v1, v2); - // CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vceq_f64 return vceq_f64(a, b); - // CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vceq_u8 return vceq_u8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vceq_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vceq_u16 return vceq_u16(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vceq_u32 return vceq_u32(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_p8(poly8x8_t v1, poly8x8_t 
v2) { - // CHECK-LABEL: test_vceq_p8 return vceq_p8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vceqq_s8 return vceqq_s8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vceqq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vceqq_s16 return vceqq_s16(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vceqq_s32 return vceqq_s32(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vceqq_f32 return vceqq_f32(v1, v2); - // CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vceqq_u8 return vceqq_u8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vceqq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vceqq_u16 return vceqq_u16(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vceqq_u32 return vceqq_u32(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_p8(poly8x16_t v1, poly8x16_t v2) { - // CHECK-LABEL: test_vceqq_p8 return vceqq_p8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i64> 
@test_vceqq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vceqq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vceqq_s64 return vceqq_s64(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vceqq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vceqq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vceqq_u64 return vceqq_u64(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vceqq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vceqq_f64 return vceqq_f64(v1, v2); - // CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcge_s8(int8x8_t v1, int8x8_t v2) { -// CHECK-LABEL: test_vcge_s8 return vcge_s8(v1, v2); -// CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcge_s16(int16x4_t v1, int16x4_t v2) { -// CHECK-LABEL: test_vcge_s16 return vcge_s16(v1, v2); -// CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcge_s32(int32x2_t v1, int32x2_t v2) { -// CHECK-LABEL: test_vcge_s32 return vcge_s32(v1, v2); -// CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcge_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vcge_s64 return vcge_s64(a, b); - // CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcge_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vcge_u64 return vcge_u64(a, b); - // CHECK: cmhs {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t 
test_vcge_f32(float32x2_t v1, float32x2_t v2) { -// CHECK-LABEL: test_vcge_f32 return vcge_f32(v1, v2); -// CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcge_f64 return vcge_f64(a, b); - // CHECK: fcmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcge_u8(uint8x8_t v1, uint8x8_t v2) { -// CHECK-LABEL: test_vcge_u8 return vcge_u8(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcge_u16(uint16x4_t v1, uint16x4_t v2) { -// CHECK-LABEL: test_vcge_u16 return vcge_u16(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcge_u32(uint32x2_t v1, uint32x2_t v2) { -// CHECK-LABEL: test_vcge_u32 return vcge_u32(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgeq_s8(int8x16_t v1, int8x16_t v2) { -// CHECK-LABEL: test_vcgeq_s8 return vcgeq_s8(v1, v2); -// CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgeq_s16(int16x8_t v1, int16x8_t v2) { -// CHECK-LABEL: test_vcgeq_s16 return vcgeq_s16(v1, v2); -// CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_s32(int32x4_t v1, int32x4_t v2) { -// CHECK-LABEL: test_vcgeq_s32 return vcgeq_s32(v1, v2); -// CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) { -// CHECK-LABEL: test_vcgeq_f32 return vcgeq_f32(v1, v2); -// CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> 
@test_vcgeq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgeq_u8(uint8x16_t v1, uint8x16_t v2) { -// CHECK-LABEL: test_vcgeq_u8 return vcgeq_u8(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgeq_u16(uint16x8_t v1, uint16x8_t v2) { -// CHECK-LABEL: test_vcgeq_u16 return vcgeq_u16(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_u32(uint32x4_t v1, uint32x4_t v2) { -// CHECK-LABEL: test_vcgeq_u32 return vcgeq_u32(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcgeq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgeq_s64(int64x2_t v1, int64x2_t v2) { -// CHECK-LABEL: test_vcgeq_s64 return vcgeq_s64(v1, v2); -// CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vcgeq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgeq_u64(uint64x2_t v1, uint64x2_t v2) { -// CHECK-LABEL: test_vcgeq_u64 return vcgeq_u64(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vcgeq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) { -// CHECK-LABEL: test_vcgeq_f64 return vcgeq_f64(v1, v2); -// CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } // Notes about vcle: // LE condition predicate implemented as GE, so check reversed operands. // Using registers other than v0, v1 is possible, but would be odd.
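// For reference, the swap follows from the identity "a <= b iff b >= a": AArch64 SIMD has no
// register-register LE compare, so vcle_* is emitted as CMGE/CMHS/FCMGE with reversed operands.
// A minimal sketch of that identity, using only arm_neon.h intrinsics already included by this
// file (the helper name is illustrative, not an intrinsic):
static inline uint8x8_t vcle_s8_as_reversed_vcge(int8x8_t a, int8x8_t b) {
  // Produces the same all-ones/all-zeros lane mask as vcle_s8(a, b).
  return vcge_s8(b, a);
}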
+// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcle_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vcle_s8 return vcle_s8(v1, v2); - // CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b } +// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcle_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vcle_s16 return vcle_s16(v1, v2); - // CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h } +// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vcle_s32 return vcle_s32(v1, v2); - // CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcle_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vcle_s64 return vcle_s64(a, b); - // CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcle_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vcle_u64 return vcle_u64(a, b); - // CHECK: cmhs {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vcle_f32 return vcle_f32(v1, v2); - // CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcle_f64 return vcle_f64(a, b); - // CHECK: fcmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcle_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vcle_u8 return vcle_u8(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b } +// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcle_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vcle_u16 return vcle_u16(v1, 
v2); - // CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h } +// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vcle_u32 return vcle_u32(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcleq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vcleq_s8 return vcleq_s8(v1, v2); - // CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b } +// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcleq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vcleq_s16 return vcleq_s16(v1, v2); - // CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h } +// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vcleq_s32 return vcleq_s32(v1, v2); - // CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcleq_f32 return vcleq_f32(v1, v2); - // CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcleq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vcleq_u8 return vcleq_u8(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b } +// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcleq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vcleq_u16 return vcleq_u16(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h } +// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vcleq_u32 return vcleq_u32(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <2 x i64> @test_vcleq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t 
test_vcleq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vcleq_s64 return vcleq_s64(v1, v2); - // CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <2 x i64> @test_vcleq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcleq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vcleq_u64 return vcleq_u64(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <2 x i64> @test_vcleq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcleq_f64 return vcleq_f64(v1, v2); - // CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcgt_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vcgt_s8 return vcgt_s8(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcgt_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vcgt_s16 return vcgt_s16(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcgt_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vcgt_s32 return vcgt_s32(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcgt_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vcgt_s64 return vcgt_s64(a, b); - // CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcgt_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vcgt_u64 return vcgt_u64(a, b); - // CHECK: cmhi {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vcgt_f32 return vcgt_f32(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: 
[[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcgt_f64 return vcgt_f64(a, b); - // CHECK: fcmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcgt_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vcgt_u8 return vcgt_u8(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcgt_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vcgt_u16 return vcgt_u16(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcgt_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vcgt_u32 return vcgt_u32(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgtq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vcgtq_s8 return vcgtq_s8(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgtq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vcgtq_s16 return vcgtq_s16(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vcgtq_s32 return vcgtq_s32(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcgtq_f32 return vcgtq_f32(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgtq_u8(uint8x16_t v1, uint8x16_t 
v2) { - // CHECK-LABEL: test_vcgtq_u8 return vcgtq_u8(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgtq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vcgtq_u16 return vcgtq_u16(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vcgtq_u32 return vcgtq_u32(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcgtq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgtq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vcgtq_s64 return vcgtq_s64(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vcgtq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgtq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vcgtq_u64 return vcgtq_u64(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vcgtq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcgtq_f64 return vcgtq_f64(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } @@ -1632,10112 +2529,20029 @@ // LT condition predicate implemented as GT, so check reversed operands. // Using registers other than v0, v1 is possible, but would be odd.
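// For reference, this is the same reversal trick as vcle above, via "a < b iff b > a": vclt_*
// reuses CMGT/CMHI/FCMGT with swapped operands. A small sketch of the identity and of the
// all-ones/all-zeros lane mask that the icmp+sext IR pattern below encodes (illustrative
// helper name, arm_neon.h intrinsics only):
static inline uint32x2_t vclt_u32_as_reversed_vcgt(uint32x2_t a, uint32x2_t b) {
  // Each lane is 0xFFFFFFFF where a < b and 0x00000000 elsewhere,
  // matching the "icmp ult" + "sext" sequence the CHECK lines expect.
  return vcgt_u32(b, a);
}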
+// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vclt_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vclt_s8 return vclt_s8(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b } +// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vclt_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vclt_s16 return vclt_s16(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h } +// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vclt_s32 return vclt_s32(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vclt_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vclt_s64 return vclt_s64(a, b); - // CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vclt_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vclt_u64 return vclt_u64(a, b); - // CHECK: cmhi {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vclt_f32 return vclt_f32(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vclt_f64 return vclt_f64(a, b); - // CHECK: fcmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vclt_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vclt_u8 return vclt_u8(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b } +// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vclt_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vclt_u16 return vclt_u16(v1, 
v2); - // CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h } +// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vclt_u32 return vclt_u32(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcltq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vcltq_s8 return vcltq_s8(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b } +// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcltq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vcltq_s16 return vcltq_s16(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h } +// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vcltq_s32 return vcltq_s32(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcltq_f32 return vcltq_f32(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcltq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vcltq_u8 return vcltq_u8(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b } +// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcltq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vcltq_u16 return vcltq_u16(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h } +// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vcltq_u32 return vcltq_u32(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <2 x i64> @test_vcltq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t 
test_vcltq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vcltq_s64 return vcltq_s64(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <2 x i64> @test_vcltq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcltq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vcltq_u64 return vcltq_u64(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <2 x i64> @test_vcltq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcltq_f64 return vcltq_f64(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VHADD_V_I]] int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) { -// CHECK-LABEL: test_vhadd_s8 return vhadd_s8(v1, v2); - // CHECK: shadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) { -// CHECK-LABEL: test_vhadd_s16 return vhadd_s16(v1, v2); - // CHECK: shadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) { -// CHECK-LABEL: test_vhadd_s32 return vhadd_s32(v1, v2); - // CHECK: shadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VHADD_V_I]] uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) { -// CHECK-LABEL: test_vhadd_u8 return vhadd_u8(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) { -// CHECK-LABEL: test_vhadd_u16 return vhadd_u16(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) { -// CHECK-LABEL: test_vhadd_u32 return vhadd_u32(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 +// CHECK: ret <16 x i8> [[VHADDQ_V_I]] int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) { -// CHECK-LABEL: test_vhaddq_s8 return vhaddq_s8(v1, v2); - // CHECK: shadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) { -// CHECK-LABEL: test_vhaddq_s16 return vhaddq_s16(v1, v2); - // CHECK: shadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t 
test_vhaddq_s32(int32x4_t v1, int32x4_t v2) { -// CHECK-LABEL: test_vhaddq_s32 return vhaddq_s32(v1, v2); - // CHECK: shadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 +// CHECK: ret <16 x i8> [[VHADDQ_V_I]] uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) { -// CHECK-LABEL: test_vhaddq_u8 return vhaddq_u8(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) { -// CHECK-LABEL: test_vhaddq_u16 return vhaddq_u16(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) { -// CHECK-LABEL: test_vhaddq_u32 return vhaddq_u32(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VHSUB_V_I]] int8x8_t test_vhsub_s8(int8x8_t v1, int8x8_t v2) { -// CHECK-LABEL: test_vhsub_s8 return vhsub_s8(v1, v2); - // CHECK: shsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vhsub_s16(int16x4_t v1, int16x4_t v2) { -// CHECK-LABEL: test_vhsub_s16 return vhsub_s16(v1, v2); - // CHECK: shsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h 
}
+// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vhsub_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vhsub_s32
  return vhsub_s32(v1, v2);
- // CHECK: shsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK: ret <8 x i8> [[VHSUB_V_I]]
uint8x8_t test_vhsub_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vhsub_u8
  return vhsub_u8(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
+// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vhsub_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vhsub_u16
  return vhsub_u16(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vhsub_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vhsub_u32
  return vhsub_u32(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
int8x16_t test_vhsubq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vhsubq_s8
  return vhsubq_s8(v1, v2);
- // CHECK: shsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vhsubq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vhsubq_s16
  return vhsubq_s16(v1, v2);
- // CHECK: shsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vhsubq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vhsubq_s32
  return vhsubq_s32(v1, v2);
- // CHECK: shsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
uint8x16_t test_vhsubq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vhsubq_u8
  return vhsubq_u8(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vhsubq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vhsubq_u16
  return vhsubq_u16(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vhsubq_u32
  return vhsubq_u32(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK: ret <8 x i8> [[VRHADD_V_I]]
int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
-// CHECK-LABEL: test_vrhadd_s8
  return vrhadd_s8(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
-// CHECK-LABEL: test_vrhadd_s16
  return vrhadd_s16(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vrhadd_s32
  return vrhadd_s32(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK: ret <8 x i8> [[VRHADD_V_I]]
uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vrhadd_u8
  return vrhadd_u8(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vrhadd_u16
  return vrhadd_u16(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vrhadd_u32
  return vrhadd_u32(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vrhaddq_s8
  return vrhaddq_s8(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vrhaddq_s16
  return vrhaddq_s16(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vrhaddq_s32
  return vrhaddq_s32(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vrhaddq_u8
  return vrhaddq_u8(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vrhaddq_u16
  return vrhaddq_u16(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vrhaddq_u32
  return vrhaddq_u32(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQADD_V_I]]
int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqadd_s8
  return vqadd_s8(a, b);
- // CHECK: sqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqadd_s16
  return vqadd_s16(a, b);
- // CHECK: sqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqadd_s32
  return vqadd_s32(a, b);
- // CHECK: sqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqadd_s64
  return vqadd_s64(a, b);
-// CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQADD_V_I]]
uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vqadd_u8
  return vqadd_u8(a, b);
- // CHECK: uqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vqadd_u16
  return vqadd_u16(a, b);
- // CHECK: uqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vqadd_u32
  return vqadd_u32(a, b);
- // CHECK: uqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK: test_vqadd_u64
  return vqadd_u64(a, b);
-// CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqaddq_s8
  return vqaddq_s8(a, b);
- // CHECK: sqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqaddq_s16
  return vqaddq_s16(a, b);
- // CHECK: sqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqaddq_s32
  return vqaddq_s32(a, b);
- // CHECK: sqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqaddq_s64
  return vqaddq_s64(a, b);
-// CHECK: sqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vqaddq_u8
  return vqaddq_u8(a, b);
- // CHECK: uqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vqaddq_u16
  return vqaddq_u16(a, b);
- // CHECK: uqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vqaddq_u32
  return vqaddq_u32(a, b);
- // CHECK: uqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
-// CHECK-LABEL: test_vqaddq_u64
  return vqaddq_u64(a, b);
-// CHECK: uqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQSUB_V_I]]
int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqsub_s8
  return vqsub_s8(a, b);
- // CHECK: sqsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqsub_s16
  return vqsub_s16(a, b);
- // CHECK: sqsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqsub_s32
  return vqsub_s32(a, b);
- // CHECK: sqsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqsub_s64
  return vqsub_s64(a, b);
-// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQSUB_V_I]]
uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vqsub_u8
  return vqsub_u8(a, b);
- // CHECK: uqsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vqsub_u16
  return vqsub_u16(a, b);
- // CHECK: uqsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vqsub_u32
  return vqsub_u32(a, b);
- // CHECK: uqsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vqsub_u64
  return vqsub_u64(a, b);
-// CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqsubq_s8
  return vqsubq_s8(a, b);
- // CHECK: sqsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqsubq_s16
  return vqsubq_s16(a, b);
- // CHECK: sqsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqsubq_s32
  return vqsubq_s32(a, b);
- // CHECK: sqsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqsubq_s64
  return vqsubq_s64(a, b);
-// CHECK: sqsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vqsubq_u8
  return vqsubq_u8(a, b);
- // CHECK: uqsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vqsubq_u16
  return vqsubq_u16(a, b);
- // CHECK: uqsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vqsubq_u32
  return vqsubq_u32(a, b);
- // CHECK: uqsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
-// CHECK-LABEL: test_vqsubq_u64
  return vqsubq_u64(a, b);
- // CHECK: uqsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VSHL_V_I]]
int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vshl_s8
  return vshl_s8(a, b);
-// CHECK: sshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vshl_s16
  return vshl_s16(a, b);
-// CHECK: sshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vshl_s32
  return vshl_s32(a, b);
-// CHECK: sshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vshl_s64
  return vshl_s64(a, b);
-// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VSHL_V_I]]
uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vshl_u8
  return vshl_u8(a, b);
-// CHECK: ushl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vshl_u16
  return vshl_u16(a, b);
-// CHECK: ushl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vshl_u32
  return vshl_u32(a, b);
-// CHECK: ushl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vshl_u64
  return vshl_u64(a, b);
-// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vshlq_s8
  return vshlq_s8(a, b);
-// CHECK: sshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vshlq_s16
  return vshlq_s16(a, b);
-// CHECK: sshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vshlq_s32
  return vshlq_s32(a, b);
-// CHECK: sshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vshlq_s64
  return vshlq_s64(a, b);
-// CHECK: sshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vshlq_u8
  return vshlq_u8(a, b);
-// CHECK: ushl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vshlq_u16
  return vshlq_u16(a, b);
-// CHECK: ushl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vshlq_u32
  return vshlq_u32(a, b);
-// CHECK: ushl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vshlq_u64
  return vshlq_u64(a, b);
-// CHECK: ushl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQSHL_V_I]]
int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqshl_s8
  return vqshl_s8(a, b);
-// CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqshl_s16
  return vqshl_s16(a, b);
-// CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqshl_s32
  return vqshl_s32(a, b);
-// CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqshl_s64
  return vqshl_s64(a, b);
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQSHL_V_I]]
uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqshl_u8
  return vqshl_u8(a, b);
-// CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqshl_u16
  return vqshl_u16(a, b);
-// CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqshl_u32
  return vqshl_u32(a, b);
-// CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqshl_u64
  return vqshl_u64(a, b);
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqshlq_s8
  return vqshlq_s8(a, b);
-// CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqshlq_s16
  return vqshlq_s16(a, b);
-// CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqshlq_s32
  return vqshlq_s32(a, b);
-// CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqshlq_s64
  return vqshlq_s64(a, b);
-// CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqshlq_u8
  return vqshlq_u8(a, b);
-// CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqshlq_u16
  return vqshlq_u16(a, b);
-// CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqshlq_u32
  return vqshlq_u32(a, b);
-// CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqshlq_u64
  return vqshlq_u64(a, b);
-// CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VRSHL_V_I]]
int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vrshl_s8
  return vrshl_s8(a, b);
-// CHECK: srshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vrshl_s16
  return vrshl_s16(a, b);
-// CHECK: srshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vrshl_s32
  return vrshl_s32(a, b);
-// CHECK: srshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vrshl_s64
  return vrshl_s64(a, b);
-// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VRSHL_V_I]]
uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vrshl_u8
  return vrshl_u8(a, b);
-// CHECK: urshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vrshl_u16
  return vrshl_u16(a, b);
-// CHECK: urshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vrshl_u32
  return vrshl_u32(a, b);
-// CHECK: urshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x
i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) { -// CHECK-LABEL: test_vrshl_u64 return vrshl_u64(a, b); -// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRSHLQ_V_I]] int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vrshlq_s8 return vrshlq_s8(a, b); -// CHECK: srshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vrshlq_s16 return vrshlq_s16(a, b); -// CHECK: srshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vrshlq_s32 return vrshlq_s32(a, b); -// CHECK: srshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) { -// CHECK-LABEL: test_vrshlq_s64 return vrshlq_s64(a, b); -// CHECK: srshl {{v[0-9]+}}.2d, 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRSHLQ_V_I]] uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vrshlq_u8 return vrshlq_u8(a, b); -// CHECK: urshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vrshlq_u16 return vrshlq_u16(a, b); -// CHECK: urshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vrshlq_u32 return vrshlq_u32(a, b); -// CHECK: urshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) { -// CHECK-LABEL: test_vrshlq_u64 return vrshlq_u64(a, b); -// CHECK: urshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQRSHL_V_I]] int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vqrshl_s8 return vqrshl_s8(a, b); -// CHECK: sqrshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: 
[[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vqrshl_s16 return vqrshl_s16(a, b); -// CHECK: sqrshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vqrshl_s32 return vqrshl_s32(a, b); -// CHECK: sqrshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) { -// CHECK-LABEL: test_vqrshl_s64 return vqrshl_s64(a, b); -// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQRSHL_V_I]] uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vqrshl_u8 return vqrshl_u8(a, b); -// CHECK: uqrshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vqrshl_u16 return vqrshl_u16(a, b); 
-// CHECK: uqrshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vqrshl_u32 return vqrshl_u32(a, b); -// CHECK: uqrshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) { -// CHECK-LABEL: test_vqrshl_u64 return vqrshl_u64(a, b); -// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vqrshlq_s8 return vqrshlq_s8(a, b); -// CHECK: sqrshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vqrshlq_s16 return vqrshlq_s16(a, b); -// CHECK: sqrshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: 
[[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vqrshlq_s32 return vqrshlq_s32(a, b); -// CHECK: sqrshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) { -// CHECK-LABEL: test_vqrshlq_s64 return vqrshlq_s64(a, b); -// CHECK: sqrshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } -// CHECK-LABEL: test_vqrshlq_u8 +// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) { return vqrshlq_u8(a, b); -// CHECK: uqrshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vqrshlq_u16 return vqrshlq_u16(a, b); -// CHECK: uqrshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vqrshlq_u32 return vqrshlq_u32(a, b); -// CHECK: uqrshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast 
<2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) { -// CHECK-LABEL: test_vqrshlq_u64 return vqrshlq_u64(a, b); -// CHECK: uqrshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 x i64> @test_vsli_n_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 0) +// CHECK: ret <1 x i64> [[VSLI_N2]] poly64x1_t test_vsli_n_p64(poly64x1_t a, poly64x1_t b) { -// CHECK-LABEL: test_vsli_n_p64 return vsli_n_p64(a, b, 0); -// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #0 } +// CHECK-LABEL: define <2 x i64> @test_vsliq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 0) +// CHECK: ret <2 x i64> [[VSLI_N2]] poly64x2_t test_vsliq_n_p64(poly64x2_t a, poly64x2_t b) { -// CHECK-LABEL: test_vsliq_n_p64 return vsliq_n_p64(a, b, 0); -// CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 } +// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMAX_I]] int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vmax_s8 return vmax_s8(a, b); -// CHECK: smax {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VMAX2_I]] int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vmax_s16 return vmax_s16(a, b); -// CHECK: smax {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <2 x i32>
[[VMAX2_I]] int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vmax_s32 return vmax_s32(a, b); -// CHECK: smax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMAX_I]] uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) { -// CHECK-LABEL: test_vmax_u8 return vmax_u8(a, b); -// CHECK: umax {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VMAX2_I]] uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) { -// CHECK-LABEL: test_vmax_u16 return vmax_u16(a, b); -// CHECK: umax {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VMAX2_I]] uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) { -// CHECK-LABEL: test_vmax_u32 return vmax_u32(a, b); -// CHECK: umax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[VMAX_I]], <2 x float> [[VMAX1_I]]) #4 +// CHECK: ret <2 x float> [[VMAX2_I]] float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vmax_f32 return vmax_f32(a, b); -// CHECK: fmax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMAX_I]] int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vmaxq_s8 return vmaxq_s8(a, b); -// CHECK: smax {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VMAX2_I]] int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) { -// 
CHECK-LABEL: test_vmaxq_s16 return vmaxq_s16(a, b); -// CHECK: smax {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VMAX2_I]] int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vmaxq_s32 return vmaxq_s32(a, b); -// CHECK: smax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMAX_I]] uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) { -// CHECK-LABEL: test_vmaxq_u8 return vmaxq_u8(a, b); -// CHECK: umax {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VMAX2_I]] uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) { -// CHECK-LABEL: test_vmaxq_u16 return vmaxq_u16(a, b); -// CHECK: umax {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VMAX2_I]] uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) { -// CHECK-LABEL: test_vmaxq_u32 return vmaxq_u32(a, b); -// CHECK: umax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[VMAX_I]], <4 x float> [[VMAX1_I]]) #4 +// CHECK: ret <4 x float> [[VMAX2_I]] float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vmaxq_f32 return vmaxq_f32(a, b); -// CHECK: fmax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmaxq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: 
[[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> [[VMAX_I]], <2 x double> [[VMAX1_I]]) #4 +// CHECK: ret <2 x double> [[VMAX2_I]] float64x2_t test_vmaxq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vmaxq_f64 return vmaxq_f64(a, b); -// CHECK: fmax {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMIN_I]] int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vmin_s8 return vmin_s8(a, b); -// CHECK: smin {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VMIN2_I]] int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vmin_s16 return vmin_s16(a, b); -// CHECK: smin {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VMIN2_I]] int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vmin_s32 return vmin_s32(a, b); -// CHECK: smin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMIN_I]] uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) { -// CHECK-LABEL: test_vmin_u8 return vmin_u8(a, b); -// CHECK: umin {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VMIN2_I]] uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) { -// CHECK-LABEL: test_vmin_u16 return vmin_u16(a, b); -// CHECK: umin {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.umin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VMIN2_I]] uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) { -// CHECK-LABEL: test_vmin_u32 return vmin_u32(a, b); -// CHECK: umin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[VMIN_I]], <2 x float> [[VMIN1_I]]) #4 +// CHECK: ret <2 x float> [[VMIN2_I]] float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vmin_f32 return vmin_f32(a, b); -// CHECK: fmin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMIN_I]] int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vminq_s8 return vminq_s8(a, b); -// CHECK: smin {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VMIN2_I]] int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vminq_s16 return vminq_s16(a, b); -// CHECK: smin {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VMIN2_I]] int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vminq_s32 return vminq_s32(a, b); -// CHECK: smin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMIN_I]] uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) { -// CHECK-LABEL: test_vminq_u8 return vminq_u8(a, b); -// CHECK: umin {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> 
[[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VMIN2_I]] uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) { -// CHECK-LABEL: test_vminq_u16 return vminq_u16(a, b); -// CHECK: umin {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VMIN2_I]] uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) { -// CHECK-LABEL: test_vminq_u32 return vminq_u32(a, b); -// CHECK: umin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[VMIN_I]], <4 x float> [[VMIN1_I]]) #4 +// CHECK: ret <4 x float> [[VMIN2_I]] float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vminq_f32 return vminq_f32(a, b); -// CHECK: fmin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vminq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> [[VMIN_I]], <2 x double> [[VMIN1_I]]) #4 +// CHECK: ret <2 x double> [[VMIN2_I]] float64x2_t test_vminq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vminq_f64 return vminq_f64(a, b); -// CHECK: fmin {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> [[VMAXNM_I]], <2 x float> [[VMAXNM1_I]]) #4 +// CHECK: ret <2 x float> [[VMAXNM2_I]] float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vmaxnm_f32 return vmaxnm_f32(a, b); -// CHECK: fmaxnm {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> [[VMAXNM_I]], <4 x float> [[VMAXNM1_I]]) #4 
+// CHECK: ret <4 x float> [[VMAXNM2_I]] float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vmaxnmq_f32 return vmaxnmq_f32(a, b); -// CHECK: fmaxnm {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmaxnmq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> [[VMAXNM_I]], <2 x double> [[VMAXNM1_I]]) #4 +// CHECK: ret <2 x double> [[VMAXNM2_I]] float64x2_t test_vmaxnmq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vmaxnmq_f64 return vmaxnmq_f64(a, b); -// CHECK: fmaxnm {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> [[VMINNM_I]], <2 x float> [[VMINNM1_I]]) #4 +// CHECK: ret <2 x float> [[VMINNM2_I]] float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vminnm_f32 return vminnm_f32(a, b); -// CHECK: fminnm {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> [[VMINNM_I]], <4 x float> [[VMINNM1_I]]) #4 +// CHECK: ret <4 x float> [[VMINNM2_I]] float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vminnmq_f32 return vminnmq_f32(a, b); -// CHECK: fminnm {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vminnmq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> [[VMINNM_I]], <2 x double> [[VMINNM1_I]]) #4 +// CHECK: ret <2 x double> [[VMINNM2_I]] float64x2_t test_vminnmq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vminnmq_f64 return vminnmq_f64(a, b); -// CHECK: fminnm {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMAX_I]] int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vpmax_s8 return vpmax_s8(a, b); -// CHECK: smaxp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define 
<4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMAX2_I]] int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vpmax_s16 return vpmax_s16(a, b); -// CHECK: smaxp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMAX2_I]] int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vpmax_s32 return vpmax_s32(a, b); -// CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMAX_I]] uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) { -// CHECK-LABEL: test_vpmax_u8 return vpmax_u8(a, b); -// CHECK: umaxp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMAX2_I]] uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) { -// CHECK-LABEL: test_vpmax_u16 return vpmax_u16(a, b); -// CHECK: umaxp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMAX2_I]] uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) { -// CHECK-LABEL: test_vpmax_u32 return vpmax_u32(a, b); -// CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[VPMAX_I]], <2 x float> 
[[VPMAX1_I]]) #4 +// CHECK: ret <2 x float> [[VPMAX2_I]] float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vpmax_f32 return vpmax_f32(a, b); -// CHECK: fmaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vpmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPMAX_I]] int8x16_t test_vpmaxq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vpmaxq_s8 return vpmaxq_s8(a, b); -// CHECK: smaxp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VPMAX2_I]] int16x8_t test_vpmaxq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vpmaxq_s16 return vpmaxq_s16(a, b); -// CHECK: smaxp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x i32> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VPMAX2_I]] int32x4_t test_vpmaxq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vpmaxq_s32 return vpmaxq_s32(a, b); -// CHECK: smaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vpmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPMAX_I]] uint8x16_t test_vpmaxq_u8(uint8x16_t a, uint8x16_t b) { -// CHECK-LABEL: test_vpmaxq_u8 return vpmaxq_u8(a, b); -// CHECK: umaxp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VPMAX2_I]] uint16x8_t test_vpmaxq_u16(uint16x8_t a, uint16x8_t b) { -// CHECK-LABEL: test_vpmaxq_u16 return vpmaxq_u16(a, b); -// CHECK: umaxp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x 
i32> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VPMAX2_I]] uint32x4_t test_vpmaxq_u32(uint32x4_t a, uint32x4_t b) { -// CHECK-LABEL: test_vpmaxq_u32 return vpmaxq_u32(a, b); -// CHECK: umaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vpmaxq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> [[VPMAX_I]], <4 x float> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x float> [[VPMAX2_I]] float32x4_t test_vpmaxq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vpmaxq_f32 return vpmaxq_f32(a, b); -// CHECK: fmaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vpmaxq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> [[VPMAX_I]], <2 x double> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x double> [[VPMAX2_I]] float64x2_t test_vpmaxq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vpmaxq_f64 return vpmaxq_f64(a, b); -// CHECK: fmaxp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMIN_I]] int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vpmin_s8 return vpmin_s8(a, b); -// CHECK: sminp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMIN2_I]] int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vpmin_s16 return vpmin_s16(a, b); -// CHECK: sminp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMIN2_I]] int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vpmin_s32 return vpmin_s32(a, b); -// CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMIN_I]] uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) { -// CHECK-LABEL: test_vpmin_u8 return vpmin_u8(a, b); -// CHECK: uminp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMIN2_I]] uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) { -// CHECK-LABEL: test_vpmin_u16 return vpmin_u16(a, b); -// CHECK: uminp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMIN2_I]] uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) { -// CHECK-LABEL: test_vpmin_u32 return vpmin_u32(a, b); -// CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[VPMIN_I]], <2 x float> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x float> [[VPMIN2_I]] float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vpmin_f32 return vpmin_f32(a, b); -// CHECK: fminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vpminq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPMIN_I]] int8x16_t test_vpminq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vpminq_s8 return vpminq_s8(a, b); -// CHECK: sminp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpminq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VPMIN2_I]] int16x8_t test_vpminq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vpminq_s16 return vpminq_s16(a, b); -// CHECK: sminp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpminq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to 
<16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VPMIN2_I]] int32x4_t test_vpminq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vpminq_s32 return vpminq_s32(a, b); -// CHECK: sminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vpminq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPMIN_I]] uint8x16_t test_vpminq_u8(uint8x16_t a, uint8x16_t b) { -// CHECK-LABEL: test_vpminq_u8 return vpminq_u8(a, b); -// CHECK: uminp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpminq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VPMIN2_I]] uint16x8_t test_vpminq_u16(uint16x8_t a, uint16x8_t b) { -// CHECK-LABEL: test_vpminq_u16 return vpminq_u16(a, b); -// CHECK: uminp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpminq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VPMIN2_I]] uint32x4_t test_vpminq_u32(uint32x4_t a, uint32x4_t b) { -// CHECK-LABEL: test_vpminq_u32 return vpminq_u32(a, b); -// CHECK: uminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vpminq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> [[VPMIN_I]], <4 x float> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x float> [[VPMIN2_I]] float32x4_t test_vpminq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vpminq_f32 return vpminq_f32(a, b); -// CHECK: fminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vpminq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> [[VPMIN_I]], <2 x double> [[VPMIN1_I]]) #4 +// 
CHECK: ret <2 x double> [[VPMIN2_I]] float64x2_t test_vpminq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vpminq_f64 return vpminq_f64(a, b); -// CHECK: fminp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vpmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> [[VPMAXNM_I]], <2 x float> [[VPMAXNM1_I]]) #4 +// CHECK: ret <2 x float> [[VPMAXNM2_I]] float32x2_t test_vpmaxnm_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vpmaxnm_f32 return vpmaxnm_f32(a, b); -// CHECK: fmaxnmp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vpmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> [[VPMAXNM_I]], <4 x float> [[VPMAXNM1_I]]) #4 +// CHECK: ret <4 x float> [[VPMAXNM2_I]] float32x4_t test_vpmaxnmq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vpmaxnmq_f32 return vpmaxnmq_f32(a, b); -// CHECK: fmaxnmp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vpmaxnmq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> [[VPMAXNM_I]], <2 x double> [[VPMAXNM1_I]]) #4 +// CHECK: ret <2 x double> [[VPMAXNM2_I]] float64x2_t test_vpmaxnmq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vpmaxnmq_f64 return vpmaxnmq_f64(a, b); -// CHECK: fmaxnmp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vpminnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> [[VPMINNM_I]], <2 x float> [[VPMINNM1_I]]) #4 +// CHECK: ret <2 x float> [[VPMINNM2_I]] float32x2_t test_vpminnm_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vpminnm_f32 return vpminnm_f32(a, b); -// CHECK: fminnmp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vpminnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: 
[[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> [[VPMINNM_I]], <4 x float> [[VPMINNM1_I]]) #4 +// CHECK: ret <4 x float> [[VPMINNM2_I]] float32x4_t test_vpminnmq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vpminnmq_f32 return vpminnmq_f32(a, b); -// CHECK: fminnmp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vpminnmq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> [[VPMINNM_I]], <2 x double> [[VPMINNM1_I]]) #4 +// CHECK: ret <2 x double> [[VPMINNM2_I]] float64x2_t test_vpminnmq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vpminnmq_f64 return vpminnmq_f64(a, b); -// CHECK: fminnmp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPADD_V_I]] int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vpadd_s8 return vpadd_s8(a, b); -// CHECK: addp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vpadd_s16 return vpadd_s16(a, b); -// CHECK: addp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vpadd_s32 return vpadd_s32(a, b); -// CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPADD_V_I]] uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) { -// CHECK-LABEL: test_vpadd_u8 return vpadd_u8(a, b); -// CHECK: addp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// 
CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) { -// CHECK-LABEL: test_vpadd_u16 return vpadd_u16(a, b); -// CHECK: addp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) { -// CHECK-LABEL: test_vpadd_u32 return vpadd_u32(a, b); -// CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vpadd_f32 return vpadd_f32(a, b); -// CHECK: faddp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vpaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPADDQ_V_I]] int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vpaddq_s8 return vpaddq_s8(a, b); -// CHECK: addp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] 
to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vpaddq_s16 return vpaddq_s16(a, b); -// CHECK: addp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vpaddq_s32 return vpaddq_s32(a, b); -// CHECK: addp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vpaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPADDQ_V_I]] uint8x16_t test_vpaddq_u8(uint8x16_t a, uint8x16_t b) { -// CHECK-LABEL: test_vpaddq_u8 return vpaddq_u8(a, b); -// CHECK: addp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vpaddq_u16(uint16x8_t a, uint16x8_t b) { -// CHECK-LABEL: test_vpaddq_u16 return vpaddq_u16(a, b); -// CHECK: addp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) { -// CHECK-LABEL: test_vpaddq_u32 return vpaddq_u32(a, b); -// CHECK: addp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vpaddq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// 
CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> [[VPADDQ_V_I]], <4 x float> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vpaddq_f32 return vpaddq_f32(a, b); -// CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vpaddq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> [[VPADDQ_V_I]], <2 x double> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x double> +// CHECK: ret <2 x double> [[TMP2]] float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vpaddq_f64 return vpaddq_f64(a, b); -// CHECK: faddp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vqdmulh_s16 return vqdmulh_s16(a, b); -// CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vqdmulh_s32 return vqdmulh_s32(a, b); -// CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vqdmulhq_s16 return vqdmulhq_s16(a, b); -// CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vqdmulhq_s32 return vqdmulhq_s32(a, b); -// CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vqrdmulh_s16 return vqrdmulh_s16(a, b); -// CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vqrdmulh_s32 return vqrdmulh_s32(a, b); -// CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> 
[[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vqrdmulhq_s16 return vqrdmulhq_s16(a, b); -// CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vqrdmulhq_s32 return vqrdmulhq_s32(a, b); -// CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vmulx_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #4 +// CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vmulx_f32 return vmulx_f32(a, b); -// CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vmulxq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #4 +// CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vmulxq_f32 return vmulxq_f32(a, b); -// CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmulxq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #4 +// CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vmulxq_f64 return vmulxq_f64(a, b); -// CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 { +// CHECK: 
[[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <8 x i8> [[VSHL_N]] int8x8_t test_vshl_n_s8(int8x8_t a) { -// CHECK-LABEL: test_vshl_n_s8 return vshl_n_s8(a, 3); -// CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <4 x i16> [[VSHL_N]] int16x4_t test_vshl_n_s16(int16x4_t a) { -// CHECK-LABEL: test_vshl_n_s16 return vshl_n_s16(a, 3); -// CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3> +// CHECK: ret <2 x i32> [[VSHL_N]] int32x2_t test_vshl_n_s32(int32x2_t a) { -// CHECK-LABEL: test_vshl_n_s32 return vshl_n_s32(a, 3); -// CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <16 x i8> [[VSHL_N]] int8x16_t test_vshlq_n_s8(int8x16_t a) { -// CHECK-LABEL: test_vshlq_n_s8 return vshlq_n_s8(a, 3); -// CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <8 x i16> [[VSHL_N]] int16x8_t test_vshlq_n_s16(int16x8_t a) { -// CHECK-LABEL: test_vshlq_n_s16 return vshlq_n_s16(a, 3); -// CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i32> [[VSHL_N]] int32x4_t test_vshlq_n_s32(int32x4_t a) { -// CHECK-LABEL: test_vshlq_n_s32 return vshlq_n_s32(a, 3); -// CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3> +// CHECK: ret <2 x i64> [[VSHL_N]] int64x2_t test_vshlq_n_s64(int64x2_t a) { -// CHECK-LABEL: test_vshlq_n_s64 return vshlq_n_s64(a, 3); -// CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <8 x i8> [[VSHL_N]] int8x8_t test_vshl_n_u8(int8x8_t a) { -// CHECK-LABEL: test_vshl_n_u8 return vshl_n_u8(a, 3); -// CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <4 x i16> [[VSHL_N]] int16x4_t test_vshl_n_u16(int16x4_t a) { -// CHECK-LABEL: test_vshl_n_u16 return vshl_n_u16(a, 3); -// CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x
i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3> +// CHECK: ret <2 x i32> [[VSHL_N]] int32x2_t test_vshl_n_u32(int32x2_t a) { -// CHECK-LABEL: test_vshl_n_u32 return vshl_n_u32(a, 3); -// CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <16 x i8> [[VSHL_N]] int8x16_t test_vshlq_n_u8(int8x16_t a) { -// CHECK-LABEL: test_vshlq_n_u8 return vshlq_n_u8(a, 3); -// CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <8 x i16> [[VSHL_N]] int16x8_t test_vshlq_n_u16(int16x8_t a) { -// CHECK-LABEL: test_vshlq_n_u16 return vshlq_n_u16(a, 3); -// CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i32> [[VSHL_N]] int32x4_t test_vshlq_n_u32(int32x4_t a) { -// CHECK-LABEL: test_vshlq_n_u32 return vshlq_n_u32(a, 3); -// CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3> +// CHECK: ret <2 x i64> [[VSHL_N]] int64x2_t test_vshlq_n_u64(int64x2_t a) { -// CHECK-LABEL: test_vshlq_n_u64 return vshlq_n_u64(a, 3); -// CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <8 x i8> [[VSHR_N]] int8x8_t test_vshr_n_s8(int8x8_t a) { - // CHECK-LABEL: test_vshr_n_s8 return vshr_n_s8(a, 3); - // CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <4 x i16> [[VSHR_N]] int16x4_t test_vshr_n_s16(int16x4_t a) { - // CHECK-LABEL: test_vshr_n_s16 return vshr_n_s16(a, 3); - // CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 3, i32 3> +// CHECK: ret <2 x i32> [[VSHR_N]] int32x2_t test_vshr_n_s32(int32x2_t a) { - // CHECK-LABEL: test_vshr_n_s32 return vshr_n_s32(a, 3); - // CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <16 x i8> [[VSHR_N]] int8x16_t test_vshrq_n_s8(int8x16_t a) { - // CHECK-LABEL: test_vshrq_n_s8 return vshrq_n_s8(a, 3); - // CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK:
[[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <8 x i16> [[VSHR_N]] int16x8_t test_vshrq_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vshrq_n_s16 return vshrq_n_s16(a, 3); - // CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i32> [[VSHR_N]] int32x4_t test_vshrq_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vshrq_n_s32 return vshrq_n_s32(a, 3); - // CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 3, i64 3> +// CHECK: ret <2 x i64> [[VSHR_N]] int64x2_t test_vshrq_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vshrq_n_s64 return vshrq_n_s64(a, 3); - // CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <8 x i8> [[VSHR_N]] int8x8_t test_vshr_n_u8(int8x8_t a) { - // CHECK-LABEL: test_vshr_n_u8 return vshr_n_u8(a, 3); - // CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <4 x i16> [[VSHR_N]] int16x4_t test_vshr_n_u16(int16x4_t a) { - // CHECK-LABEL: test_vshr_n_u16 return vshr_n_u16(a, 3); - // CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 3, i32 3> +// CHECK: ret <2 x i32> [[VSHR_N]] int32x2_t test_vshr_n_u32(int32x2_t a) { - // CHECK-LABEL: test_vshr_n_u32 return vshr_n_u32(a, 3); - // CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <16 x i8> [[VSHR_N]] int8x16_t test_vshrq_n_u8(int8x16_t a) { - // CHECK-LABEL: test_vshrq_n_u8 return vshrq_n_u8(a, 3); - // CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <8 x i16> [[VSHR_N]] int16x8_t test_vshrq_n_u16(int16x8_t a) { - // CHECK-LABEL: test_vshrq_n_u16 return vshrq_n_u16(a, 3); - // CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i32> [[VSHR_N]] int32x4_t test_vshrq_n_u32(int32x4_t a) { - // CHECK-LABEL: test_vshrq_n_u32 return vshrq_n_u32(a, 3); - // CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 }
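// As a cross-check on the ashr/lshr patterns in the vshr_n tests around this
// point, here is a minimal scalar sketch of the per-lane semantics they pin
// down. It is an editorial illustration under stated assumptions, not part of
// the checked test: the ref_* helper names are hypothetical, n is assumed to
// be a valid shift immediate, and right-shifting a negative int is assumed to
// be arithmetic (implementation-defined in C, but true for the targets here).
#include <stdint.h>

// vshr_n_s8(a, n): per-lane arithmetic shift right (IR ashr, asm sshr).
static inline int8_t ref_vshr_n_s8(int8_t a, int n) {
  return (int8_t)(a >> n); /* a promotes to int; shift assumed arithmetic */
}

// vshr_n_u8(a, n): per-lane logical shift right (IR lshr, asm ushr).
static inline uint8_t ref_vshr_n_u8(uint8_t a, int n) {
  return (uint8_t)(a >> n);
}

// vsra_n_s8(a, b, n), tested just below: shift right then accumulate, i.e.
// the "ashr %b, splat(3)" followed by "add %a, ..." pattern in the checks.
static inline int8_t ref_vsra_n_s8(int8_t a, int8_t b, int n) {
  return (int8_t)(a + (int8_t)(b >> n));
}

// vrshr_n_s8(a, n), further below: rounding shift right (add half, then
// shift); the IR expresses it as @llvm.aarch64.neon.srshl by a splat of -n.
static inline int8_t ref_vrshr_n_s8(int8_t a, int n) {
  return (int8_t)((a + (1 << (n - 1))) >> n);
}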
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 3, i64 3> +// CHECK: ret <2 x i64> [[VSHR_N]] int64x2_t test_vshrq_n_u64(int64x2_t a) { - // CHECK-LABEL: test_vshrq_n_u64 return vshrq_n_u64(a, 3); - // CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsra_n_s8 return vsra_n_s8(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i16> [[TMP4]] int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsra_n_s16 return vsra_n_s16(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 3, i32 3> +// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i32> [[TMP4]] int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsra_n_s32 return vsra_n_s32(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsraq_n_s8 return vsraq_n_s8(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <8 x i16> [[TMP4]] int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsraq_n_s16 return vsraq_n_s16(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32>
[[TMP3]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i32> [[TMP4]] int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsraq_n_s32 return vsraq_n_s32(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 3, i64 3> +// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i64> [[TMP4]] int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsraq_n_s64 return vsraq_n_s64(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vsra_n_u8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsra_n_u8 return vsra_n_u8(a, b, 3); - // CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i16> [[TMP4]] int16x4_t test_vsra_n_u16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsra_n_u16 return vsra_n_u16(a, b, 3); - // CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 3, i32 3> +// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i32> [[TMP4]] int32x2_t test_vsra_n_u32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsra_n_u32 return vsra_n_u32(a, b, 3); - // CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vsraq_n_u8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsraq_n_u8 return vsraq_n_u8(a, b, 3); - // CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <8 x i16> [[TMP4]] int16x8_t test_vsraq_n_u16(int16x8_t a, int16x8_t b) { - //
CHECK-LABEL: test_vsraq_n_u16 return vsraq_n_u16(a, b, 3); - // CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i32> [[TMP4]] int32x4_t test_vsraq_n_u32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsraq_n_u32 return vsraq_n_u32(a, b, 3); - // CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 3, i64 3> +// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i64> [[TMP4]] int64x2_t test_vsraq_n_u64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsraq_n_u64 return vsraq_n_u64(a, b, 3); - // CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: ret <8 x i8> [[VRSHR_N]] int8x8_t test_vrshr_n_s8(int8x8_t a) { - // CHECK-LABEL: test_vrshr_n_s8 return vrshr_n_s8(a, 3); - // CHECK: srshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: ret <4 x i16> [[VRSHR_N]]1 int16x4_t test_vrshr_n_s16(int16x4_t a) { - // CHECK-LABEL: test_vrshr_n_s16 return vrshr_n_s16(a, 3); - // CHECK: srshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>) +// CHECK: ret <2 x i32> [[VRSHR_N]]1 int32x2_t test_vrshr_n_s32(int32x2_t a) { - // CHECK-LABEL: test_vrshr_n_s32 return vrshr_n_s32(a, 3); - // CHECK: srshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: ret <16 x i8> [[VRSHR_N]] int8x16_t test_vrshrq_n_s8(int8x16_t a) { - // CHECK-LABEL: test_vrshrq_n_s8 return vrshrq_n_s8(a, 3); - // CHECK: srshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: ret <8 x i16> [[VRSHR_N]]1 int16x8_t test_vrshrq_n_s16(int16x8_t a) {
- // CHECK-LABEL: test_vrshrq_n_s16 return vrshrq_n_s16(a, 3); - // CHECK: srshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>) +// CHECK: ret <4 x i32> [[VRSHR_N]]1 int32x4_t test_vrshrq_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vrshrq_n_s32 return vrshrq_n_s32(a, 3); - // CHECK: srshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>) +// CHECK: ret <2 x i64> [[VRSHR_N]]1 int64x2_t test_vrshrq_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vrshrq_n_s64 return vrshrq_n_s64(a, 3); - // CHECK: srshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: ret <8 x i8> [[VRSHR_N]] int8x8_t test_vrshr_n_u8(int8x8_t a) { - // CHECK-LABEL: test_vrshr_n_u8 return vrshr_n_u8(a, 3); - // CHECK: urshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: ret <4 x i16> [[VRSHR_N]]1 int16x4_t test_vrshr_n_u16(int16x4_t a) { - // CHECK-LABEL: test_vrshr_n_u16 return vrshr_n_u16(a, 3); - // CHECK: urshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>) +// CHECK: ret <2 x i32> [[VRSHR_N]]1 int32x2_t test_vrshr_n_u32(int32x2_t a) { - // CHECK-LABEL: test_vrshr_n_u32 return vrshr_n_u32(a, 3); - // CHECK: urshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: ret <16 x i8> [[VRSHR_N]] int8x16_t test_vrshrq_n_u8(int8x16_t a) { - // CHECK-LABEL: test_vrshrq_n_u8 return vrshrq_n_u8(a, 3); - // CHECK: urshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: ret <8 x i16> [[VRSHR_N]]1 int16x8_t test_vrshrq_n_u16(int16x8_t a) { - // CHECK-LABEL: test_vrshrq_n_u16 return vrshrq_n_u16(a, 3); - // CHECK: urshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast
<16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>) +// CHECK: ret <4 x i32> [[VRSHR_N]]1 int32x4_t test_vrshrq_n_u32(int32x4_t a) { - // CHECK-LABEL: test_vrshrq_n_u32 return vrshrq_n_u32(a, 3); - // CHECK: urshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>) +// CHECK: ret <2 x i64> [[VRSHR_N]]1 int64x2_t test_vrshrq_n_u64(int64x2_t a) { - // CHECK-LABEL: test_vrshrq_n_u64 return vrshrq_n_u64(a, 3); - // CHECK: urshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vrsra_n_s8 return vrsra_n_s8(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i16> [[TMP3]] int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vrsra_n_s16 return vrsra_n_s16(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i32> [[TMP3]] int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vrsra_n_s32 return vrsra_n_s32(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vrsraq_n_s8 return vrsraq_n_s8(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x
i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <8 x i16> [[TMP3]] int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vrsraq_n_s16 return vrsraq_n_s16(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i32> [[TMP3]] int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vrsraq_n_s32 return vrsraq_n_s32(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i64> [[TMP3]] int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vrsraq_n_s64 return vrsraq_n_s64(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vrsra_n_u8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vrsra_n_u8 return vrsra_n_u8(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i16> [[TMP3]] int16x4_t test_vrsra_n_u16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vrsra_n_u16 return vrsra_n_u16(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i32> [[TMP3]] int32x2_t test_vrsra_n_u32(int32x2_t a, int32x2_t b) { - //
CHECK-LABEL: test_vrsra_n_u32 return vrsra_n_u32(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> ) +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vrsraq_n_u8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vrsraq_n_u8 return vrsraq_n_u8(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> ) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <8 x i16> [[TMP3]] int16x8_t test_vrsraq_n_u16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vrsraq_n_u16 return vrsraq_n_u16(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> ) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i32> [[TMP3]] int32x4_t test_vrsraq_n_u32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vrsraq_n_u32 return vrsraq_n_u32(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> ) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i64> [[TMP3]] int64x2_t test_vrsraq_n_u64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vrsraq_n_u64 return vrsraq_n_u64(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> [[VSRI_N]] int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsri_n_s8 return vsri_n_s8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> 
[[VSRI_N]], <4 x i16> [[VSRI_N]]1, i32 3) +// CHECK: ret <4 x i16> [[VSRI_N]]2 int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsri_n_s16 return vsri_n_s16(a, b, 3); - // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N]]1, i32 3) +// CHECK: ret <2 x i32> [[VSRI_N]]2 int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsri_n_s32 return vsri_n_s32(a, b, 3); - // CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSRI_N]] int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsriq_n_s8 return vsriq_n_s8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N]]1, i32 3) +// CHECK: ret <8 x i16> [[VSRI_N]]2 int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsriq_n_s16 return vsriq_n_s16(a, b, 3); - // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N]]1, i32 3) +// CHECK: ret <4 x i32> [[VSRI_N]]2 int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsriq_n_s32 return vsriq_n_s32(a, b, 3); - // CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N]]1, i32 3) +// CHECK: ret <2 x i64> [[VSRI_N]]2 int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsriq_n_s64 return vsriq_n_s64(a, b, 3); - // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> 
[[VSRI_N]] int8x8_t test_vsri_n_u8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsri_n_u8 return vsri_n_u8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N]]1, i32 3) +// CHECK: ret <4 x i16> [[VSRI_N]]2 int16x4_t test_vsri_n_u16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsri_n_u16 return vsri_n_u16(a, b, 3); - // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N]]1, i32 3) +// CHECK: ret <2 x i32> [[VSRI_N]]2 int32x2_t test_vsri_n_u32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsri_n_u32 return vsri_n_u32(a, b, 3); - // CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSRI_N]] int8x16_t test_vsriq_n_u8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsriq_n_u8 return vsriq_n_u8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N]]1, i32 3) +// CHECK: ret <8 x i16> [[VSRI_N]]2 int16x8_t test_vsriq_n_u16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsriq_n_u16 return vsriq_n_u16(a, b, 3); - // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N]]1, i32 3) +// CHECK: ret <4 x i32> [[VSRI_N]]2 int32x4_t test_vsriq_n_u32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsriq_n_u32 return vsriq_n_u32(a, b, 3); - // CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: 
[[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N]]1, i32 3) +// CHECK: ret <2 x i64> [[VSRI_N]]2 int64x2_t test_vsriq_n_u64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsriq_n_u64 return vsriq_n_u64(a, b, 3); - // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> [[VSRI_N]] poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vsri_n_p8 return vsri_n_p8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N]]1, i32 15) +// CHECK: ret <4 x i16> [[VSRI_N]]2 poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vsri_n_p16 return vsri_n_p16(a, b, 15); - // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 } +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSRI_N]] poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vsriq_n_p8 return vsriq_n_p8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N]]1, i32 15) +// CHECK: ret <8 x i16> [[VSRI_N]]2 poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vsriq_n_p16 return vsriq_n_p16(a, b, 15); - // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 } +// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> [[VSLI_N]] int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsli_n_s8 return vsli_n_s8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N]]1, i32 3) +// CHECK: ret <4 x i16> [[VSLI_N]]2 int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsli_n_s16 return vsli_n_s16(a, b, 3); - // CHECK: sli 
{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N]]1, i32 3) +// CHECK: ret <2 x i32> [[VSLI_N]]2 int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsli_n_s32 return vsli_n_s32(a, b, 3); - // CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSLI_N]] int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsliq_n_s8 return vsliq_n_s8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N]]1, i32 3) +// CHECK: ret <8 x i16> [[VSLI_N]]2 int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsliq_n_s16 return vsliq_n_s16(a, b, 3); - // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N]]1, i32 3) +// CHECK: ret <4 x i32> [[VSLI_N]]2 int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsliq_n_s32 return vsliq_n_s32(a, b, 3); - // CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N]]1, i32 3) +// CHECK: ret <2 x i64> [[VSLI_N]]2 int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsliq_n_s64 return vsliq_n_s64(a, b, 3); - // CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> [[VSLI_N]] uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vsli_n_u8 return vsli_n_u8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> 
@test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N]]1, i32 3) +// CHECK: ret <4 x i16> [[VSLI_N]]2 uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vsli_n_u16 return vsli_n_u16(a, b, 3); - // CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N]]1, i32 3) +// CHECK: ret <2 x i32> [[VSLI_N]]2 uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vsli_n_u32 return vsli_n_u32(a, b, 3); - // CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSLI_N]] uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vsliq_n_u8 return vsliq_n_u8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N]]1, i32 3) +// CHECK: ret <8 x i16> [[VSLI_N]]2 uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vsliq_n_u16 return vsliq_n_u16(a, b, 3); - // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N]]1, i32 3) +// CHECK: ret <4 x i32> [[VSLI_N]]2 uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vsliq_n_u32 return vsliq_n_u32(a, b, 3); - // CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N]]1, i32 3) +// CHECK: ret 
<2 x i64> [[VSLI_N]]2 uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vsliq_n_u64 return vsliq_n_u64(a, b, 3); - // CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> [[VSLI_N]] poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vsli_n_p8 return vsli_n_p8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N]]1, i32 15) +// CHECK: ret <4 x i16> [[VSLI_N]]2 poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vsli_n_p16 return vsli_n_p16(a, b, 15); - // CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 } +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSLI_N]] poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vsliq_n_p8 return vsliq_n_p8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N]]1, i32 15) +// CHECK: ret <8 x i16> [[VSLI_N]]2 poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vsliq_n_p16 return vsliq_n_p16(a, b, 15); - // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 } +// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>) +// CHECK: ret <8 x i8> [[VQSHLU_N]] int8x8_t test_vqshlu_n_s8(int8x8_t a) { - // CHECK-LABEL: test_vqshlu_n_s8 return vqshlu_n_s8(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 3, i16 3, i16 3, i16 3>) +// CHECK: ret <4 x i16> [[VQSHLU_N]]1 int16x4_t test_vqshlu_n_s16(int16x4_t a) { - // CHECK-LABEL: test_vqshlu_n_s16 return vqshlu_n_s16(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 3, i32 3>) +// CHECK: ret <2 x i32> [[VQSHLU_N]]1 int32x2_t test_vqshlu_n_s32(int32x2_t a) { - // CHECK-LABEL: test_vqshlu_n_s32 return vqshlu_n_s32(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>) +// CHECK: ret <16 x i8> [[VQSHLU_N]] int8x16_t test_vqshluq_n_s8(int8x16_t a) { - // CHECK-LABEL: test_vqshluq_n_s8 return vqshluq_n_s8(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>) +// CHECK: ret <8 x i16> [[VQSHLU_N]]1 int16x8_t test_vqshluq_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vqshluq_n_s16 return vqshluq_n_s16(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>) +// CHECK: ret <4 x i32> [[VQSHLU_N]]1 int32x4_t test_vqshluq_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vqshluq_n_s32 return vqshluq_n_s32(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 3, i64 3>) +// CHECK: ret <2 x i64> [[VQSHLU_N]]1 int64x2_t test_vqshluq_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vqshluq_n_s64 return vqshluq_n_s64(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSHRN_N]] int8x8_t test_vshrn_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vshrn_n_s16 return vshrn_n_s16(a, 3); - // CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9> +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSHRN_N]] int16x4_t test_vshrn_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vshrn_n_s32 return vshrn_n_s32(a, 9); - // CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 19, i64 19> +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSHRN_N]] int32x2_t test_vshrn_n_s64(int64x2_t a) { - // CHECK-LABEL:
test_vshrn_n_s64 return vshrn_n_s64(a, 19); - // CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSHRN_N]] uint8x8_t test_vshrn_n_u16(uint16x8_t a) { - // CHECK-LABEL: test_vshrn_n_u16 return vshrn_n_u16(a, 3); - // CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSHRN_N]] uint16x4_t test_vshrn_n_u32(uint32x4_t a) { - // CHECK-LABEL: test_vshrn_n_u32 return vshrn_n_u32(a, 9); - // CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSHRN_N]] uint32x2_t test_vshrn_n_u64(uint64x2_t a) { - // CHECK-LABEL: test_vshrn_n_u64 return vshrn_n_u64(a, 19); - // CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vshrn_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vshrn_high_n_s16 return vshrn_high_n_s16(a, b, 3); - // CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vshrn_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vshrn_high_n_s32 return vshrn_high_n_s32(a, b, 9); - // CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vshrn_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vshrn_high_n_s64 return vshrn_high_n_s64(a, b, 19); - // CHECK: shrn2 
{{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vshrn_high_n_u16 return vshrn_high_n_u16(a, b, 3); - // CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vshrn_high_n_u32 return vshrn_high_n_u32(a, b, 9); - // CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vshrn_high_n_u64 return vshrn_high_n_u64(a, b, 19); - // CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQSHRUN_N]]1 int8x8_t test_vqshrun_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vqshrun_n_s16 return vqshrun_n_s16(a, 3); - // CHECK: sqshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQSHRUN_N]]1 int16x4_t test_vqshrun_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vqshrun_n_s32 return vqshrun_n_s32(a, 9); - // CHECK: sqshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQSHRUN_N]]1 int32x2_t test_vqshrun_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vqshrun_n_s64 return vqshrun_n_s64(a, 19); - // CHECK: sqshrun {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRUN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vqshrun_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vqshrun_high_n_s16 return vqshrun_high_n_s16(a, b, 3); - // CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRUN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vqshrun_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqshrun_high_n_s32 return vqshrun_high_n_s32(a, b, 9); - // CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRUN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vqshrun_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vqshrun_high_n_s64 return vqshrun_high_n_s64(a, b, 19); - // CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VRSHRN_N]]1 int8x8_t test_vrshrn_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vrshrn_n_s16 return vrshrn_n_s16(a, 3); - // CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VRSHRN_N]]1 int16x4_t test_vrshrn_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vrshrn_n_s32 return vrshrn_n_s32(a, 9); - // CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VRSHRN_N]]1 int32x2_t test_vrshrn_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vrshrn_n_s64 return vrshrn_n_s64(a, 19); - // CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// 
CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VRSHRN_N]]1 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) { - // CHECK-LABEL: test_vrshrn_n_u16 return vrshrn_n_u16(a, 3); - // CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VRSHRN_N]]1 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) { - // CHECK-LABEL: test_vrshrn_n_u32 return vrshrn_n_u32(a, 9); - // CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VRSHRN_N]]1 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) { - // CHECK-LABEL: test_vrshrn_n_u64 return vrshrn_n_u64(a, 19); - // CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrshrn_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vrshrn_high_n_s16 return vrshrn_high_n_s16(a, b, 3); - // CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vrshrn_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vrshrn_high_n_s32 return vrshrn_high_n_s32(a, b, 9); - // CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vrshrn_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vrshrn_high_n_s64 return vrshrn_high_n_s64(a, b, 19); - // CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vrshrn_high_n_u16(<8 
x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vrshrn_high_n_u16 return vrshrn_high_n_u16(a, b, 3); - // CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vrshrn_high_n_u32 return vrshrn_high_n_u32(a, b, 9); - // CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vrshrn_high_n_u64 return vrshrn_high_n_u64(a, b, 19); - // CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQRSHRUN_N]]1 int8x8_t test_vqrshrun_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vqrshrun_n_s16 return vqrshrun_n_s16(a, 3); - // CHECK: sqrshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQRSHRUN_N]]1 int16x4_t test_vqrshrun_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vqrshrun_n_s32 return vqrshrun_n_s32(a, 9); - // CHECK: sqrshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQRSHRUN_N]]1 int32x2_t test_vqrshrun_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vqrshrun_n_s64 return vqrshrun_n_s64(a, 19); - // CHECK: sqrshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> 
@test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRUN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vqrshrun_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vqrshrun_high_n_s16 return vqrshrun_high_n_s16(a, b, 3); - // CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRUN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vqrshrun_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqrshrun_high_n_s32 return vqrshrun_high_n_s32(a, b, 9); - // CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRUN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vqrshrun_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vqrshrun_high_n_s64 return vqrshrun_high_n_s64(a, b, 19); - // CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQSHRN_N]]1 int8x8_t test_vqshrn_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vqshrn_n_s16 return vqshrn_n_s16(a, 3); - // CHECK: sqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQSHRN_N]]1 int16x4_t test_vqshrn_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vqshrn_n_s32 return vqshrn_n_s32(a, 9); - // CHECK: sqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQSHRN_N]]1 int32x2_t test_vqshrn_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vqshrn_n_s64 return vqshrn_n_s64(a, 19); - // CHECK: sqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define 
<8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQSHRN_N]]1 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) { - // CHECK-LABEL: test_vqshrn_n_u16 return vqshrn_n_u16(a, 3); - // CHECK: uqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQSHRN_N]]1 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) { - // CHECK-LABEL: test_vqshrn_n_u32 return vqshrn_n_u32(a, 9); - // CHECK: uqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQSHRN_N]]1 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) { - // CHECK-LABEL: test_vqshrn_n_u64 return vqshrn_n_u64(a, 19); - // CHECK: uqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vqshrn_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vqshrn_high_n_s16 return vqshrn_high_n_s16(a, b, 3); - // CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vqshrn_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqshrn_high_n_s32 return vqshrn_high_n_s32(a, b, 9); - // CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vqshrn_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vqshrn_high_n_s64 return vqshrn_high_n_s64(a, b, 19); - // CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> 
%a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vqshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vqshrn_high_n_u16 return vqshrn_high_n_u16(a, b, 3); - // CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vqshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vqshrn_high_n_u32 return vqshrn_high_n_u32(a, b, 9); - // CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vqshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vqshrn_high_n_u64 return vqshrn_high_n_u64(a, b, 19); - // CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQRSHRN_N]]1 int8x8_t test_vqrshrn_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vqrshrn_n_s16 return vqrshrn_n_s16(a, 3); - // CHECK: sqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQRSHRN_N]]1 int16x4_t test_vqrshrn_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vqrshrn_n_s32 return vqrshrn_n_s32(a, 9); - // CHECK: sqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQRSHRN_N]]1 int32x2_t test_vqrshrn_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vqrshrn_n_s64 return vqrshrn_n_s64(a, 19); - // CHECK: sqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 { +// 
CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQRSHRN_N]]1 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) { - // CHECK-LABEL: test_vqrshrn_n_u16 return vqrshrn_n_u16(a, 3); - // CHECK: uqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQRSHRN_N]]1 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) { - // CHECK-LABEL: test_vqrshrn_n_u32 return vqrshrn_n_u32(a, 9); - // CHECK: uqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQRSHRN_N]]1 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) { - // CHECK-LABEL: test_vqrshrn_n_u64 return vqrshrn_n_u64(a, 19); - // CHECK: uqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_s16 return vqrshrn_high_n_s16(a, b, 3); - // CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_s32 return vqrshrn_high_n_s32(a, b, 9); - // CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_s64 return vqrshrn_high_n_s64(a, b, 19); - // CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> 
@test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_u16 return vqrshrn_high_n_u16(a, b, 3); - // CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_u32 return vqrshrn_high_n_u32(a, b, 9); - // CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_u64 return vqrshrn_high_n_u64(a, b, 19); - // CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] int16x8_t test_vshll_n_s8(int8x8_t a) { -// CHECK-LABEL: test_vshll_n_s8 return vshll_n_s8(a, 3); -// CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] int32x4_t test_vshll_n_s16(int16x4_t a) { -// CHECK-LABEL: test_vshll_n_s16 return vshll_n_s16(a, 9); -// CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9 } +// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] int64x2_t test_vshll_n_s32(int32x2_t a) { -// CHECK-LABEL: test_vshll_n_s32 return vshll_n_s32(a, 19); -// CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19 } +// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> 
[[VSHLL_N]] uint16x8_t test_vshll_n_u8(uint8x8_t a) { -// CHECK-LABEL: test_vshll_n_u8 return vshll_n_u8(a, 3); -// CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] uint32x4_t test_vshll_n_u16(uint16x4_t a) { -// CHECK-LABEL: test_vshll_n_u16 return vshll_n_u16(a, 9); -// CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9 } +// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] uint64x2_t test_vshll_n_u32(uint32x2_t a) { -// CHECK-LABEL: test_vshll_n_u32 return vshll_n_u32(a, 19); -// CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19 } +// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] int16x8_t test_vshll_high_n_s8(int8x16_t a) { -// CHECK-LABEL: test_vshll_high_n_s8 return vshll_high_n_s8(a, 3); -// CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] int32x4_t test_vshll_high_n_s16(int16x8_t a) { -// CHECK-LABEL: test_vshll_high_n_s16 return vshll_high_n_s16(a, 9); -// CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9 } +// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] int64x2_t test_vshll_high_n_s32(int32x4_t a) { -// CHECK-LABEL: test_vshll_high_n_s32 return vshll_high_n_s32(a, 19); -// CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19 } +// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] uint16x8_t test_vshll_high_n_u8(uint8x16_t a) { -// CHECK-LABEL: test_vshll_high_n_u8 return vshll_high_n_u8(a, 3); -// CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// 
CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK: ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
-// CHECK-LABEL: test_vshll_high_n_u16
   return vshll_high_n_u16(a, 9);
-// CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9
 }
+// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
+// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
+// CHECK: ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vshll_high_n_u32
   return vshll_high_n_u32(a, 19);
-// CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 {
+// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK: ret <8 x i16> [[VMOVL_I]]
 int16x8_t test_vmovl_s8(int8x8_t a) {
-// CHECK-LABEL: test_vmovl_s8
   return vmovl_s8(a);
-// CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
 }
+// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: ret <4 x i32> [[VMOVL_I]]
 int32x4_t test_vmovl_s16(int16x4_t a) {
-// CHECK-LABEL: test_vmovl_s16
   return vmovl_s16(a);
-// CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
 }
+// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: ret <2 x i64> [[VMOVL_I]]
 int64x2_t test_vmovl_s32(int32x2_t a) {
-// CHECK-LABEL: test_vmovl_s32
   return vmovl_s32(a);
-// CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 {
+// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK: ret <8 x i16> [[VMOVL_I]]
 uint16x8_t test_vmovl_u8(uint8x8_t a) {
-// CHECK-LABEL: test_vmovl_u8
   return vmovl_u8(a);
-// CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
 }
+// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: ret <4 x i32> [[VMOVL_I]]
 uint32x4_t test_vmovl_u16(uint16x4_t a) {
-// CHECK-LABEL: test_vmovl_u16
   return vmovl_u16(a);
-// CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
 }
+// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: ret <2 x i64> [[VMOVL_I]]
 uint64x2_t test_vmovl_u32(uint32x2_t a) {
-// CHECK-LABEL: test_vmovl_u32
   return vmovl_u32(a);
-// CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovl_high_s8(<16 x i8> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP0]]
 int16x8_t test_vmovl_high_s8(int8x16_t a) {
-// CHECK-LABEL: test_vmovl_high_s8
   return vmovl_high_s8(a);
-// CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
 }
+// CHECK-LABEL: define <4 x i32> @test_vmovl_high_s16(<8 x i16> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vmovl_high_s16(int16x8_t a) {
-// CHECK-LABEL: test_vmovl_high_s16
   return vmovl_high_s16(a);
-// CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
 }
+// CHECK-LABEL: define <2 x i64> @test_vmovl_high_s32(<4 x i32> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 int64x2_t test_vmovl_high_s32(int32x4_t a) {
-// CHECK-LABEL: test_vmovl_high_s32
   return vmovl_high_s32(a);
-// CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovl_high_u8(<16 x i8> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP0]]
 uint16x8_t test_vmovl_high_u8(uint8x16_t a) {
-// CHECK-LABEL: test_vmovl_high_u8
   return vmovl_high_u8(a);
-// CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
 }
+// CHECK-LABEL: define <4 x i32> @test_vmovl_high_u16(<8 x i16> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 uint32x4_t test_vmovl_high_u16(uint16x8_t a) {
-// CHECK-LABEL: test_vmovl_high_u16
   return vmovl_high_u16(a);
-// CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
 }
+// CHECK-LABEL: define <2 x i64> @test_vmovl_high_u32(<4 x i32> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 uint64x2_t test_vmovl_high_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vmovl_high_u32
   return vmovl_high_u32(a);
-// CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
 }
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
+// CHECK: ret <2 x float> [[VCVT_N]]1
 float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
- // CHECK-LABEL: test_vcvt_n_f32_s32
   return vcvt_n_f32_s32(a, 31);
- // CHECK: scvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
 }
+// CHECK-LABEL: define <4 x
float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31) +// CHECK: ret <4 x float> [[VCVT_N]]1 float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) { - // CHECK-LABEL: test_vcvtq_n_f32_s32 return vcvtq_n_f32_s32(a, 31); - // CHECK: scvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 } +// CHECK-LABEL: define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50) +// CHECK: ret <2 x double> [[VCVT_N]]1 float64x2_t test_vcvtq_n_f64_s64(int64x2_t a) { - // CHECK-LABEL: test_vcvtq_n_f64_s64 return vcvtq_n_f64_s64(a, 50); - // CHECK: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 } +// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31) +// CHECK: ret <2 x float> [[VCVT_N]]1 float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) { - // CHECK-LABEL: test_vcvt_n_f32_u32 return vcvt_n_f32_u32(a, 31); - // CHECK: ucvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 } +// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31) +// CHECK: ret <4 x float> [[VCVT_N]]1 float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) { - // CHECK-LABEL: test_vcvtq_n_f32_u32 return vcvtq_n_f32_u32(a, 31); - // CHECK: ucvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 } +// CHECK-LABEL: define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50) +// CHECK: ret <2 x double> [[VCVT_N]]1 float64x2_t test_vcvtq_n_f64_u64(uint64x2_t a) { - // CHECK-LABEL: test_vcvtq_n_f64_u64 return vcvtq_n_f64_u64(a, 50); - // CHECK: ucvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 } +// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31) +// CHECK: ret <2 x i32> [[VCVT_N]]1 int32x2_t test_vcvt_n_s32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvt_n_s32_f32 return vcvt_n_s32_f32(a, 31); - // CHECK: fcvtzs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 } +// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31) +// CHECK: ret <4 x i32> [[VCVT_N]]1 
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtq_n_s32_f32 return vcvtq_n_s32_f32(a, 31); - // CHECK: fcvtzs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 } +// CHECK-LABEL: define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50) +// CHECK: ret <2 x i64> [[VCVT_N]]1 int64x2_t test_vcvtq_n_s64_f64(float64x2_t a) { - // CHECK-LABEL: test_vcvtq_n_s64_f64 return vcvtq_n_s64_f64(a, 50); - // CHECK: fcvtzs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 } +// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31) +// CHECK: ret <2 x i32> [[VCVT_N]]1 uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvt_n_u32_f32 return vcvt_n_u32_f32(a, 31); - // CHECK: fcvtzu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 } +// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31) +// CHECK: ret <4 x i32> [[VCVT_N]]1 uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtq_n_u32_f32 return vcvtq_n_u32_f32(a, 31); - // CHECK: fcvtzu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 } +// CHECK-LABEL: define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50) +// CHECK: ret <2 x i64> [[VCVT_N]]1 uint64x2_t test_vcvtq_n_u64_f64(float64x2_t a) { - // CHECK-LABEL: test_vcvtq_n_u64_f64 return vcvtq_n_u64_f64(a, 50); - // CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 } +// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vaddl_s8 return vaddl_s8(a, b); - // CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vaddl_s16 return vaddl_s16(a, b); - // CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, 
{{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vaddl_s32 return vaddl_s32(a, b); - // CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vaddl_u8 return vaddl_u8(a, b); - // CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vaddl_u16 return vaddl_u16(a, b); - // CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vaddl_u32 return vaddl_u32(a, b); - // CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddl_high_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vaddl_high_s8 return vaddl_high_s8(a, b); - // CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: 
[[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]]
+// CHECK: ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddl_high_s16(int16x8_t a, int16x8_t b) {
- // CHECK-LABEL: test_vaddl_high_s16
   return vaddl_high_s16(a, b);
- // CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK: [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]]
+// CHECK: ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddl_high_s32(int32x4_t a, int32x4_t b) {
- // CHECK-LABEL: test_vaddl_high_s32
   return vaddl_high_s32(a, b);
- // CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK: ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddl_high_u8(uint8x16_t a, uint8x16_t b) {
- // CHECK-LABEL: test_vaddl_high_u8
   return vaddl_high_u8(a, b);
- // CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]]
+// CHECK: ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddl_high_u16(uint16x8_t a, uint16x8_t b) {
- // CHECK-LABEL: test_vaddl_high_u16
   return vaddl_high_u16(a, b);
- // CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32>
%b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddl_high_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vaddl_high_u32 return vaddl_high_u32(a, b); - // CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vaddw_s8 return vaddw_s8(a, b); - // CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vaddw_s16 return vaddw_s16(a, b); - // CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vaddw_s32 return vaddw_s32(a, b); - // CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vaddw_u8 return vaddw_u8(a, b); - // CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vaddw_u16 return vaddw_u16(a, b); - // CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// 
CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vaddw_u32 return vaddw_u32(a, b); - // CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddw_high_s8(int16x8_t a, int8x16_t b) { - // CHECK-LABEL: test_vaddw_high_s8 return vaddw_high_s8(a, b); - // CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddw_high_s16(int32x4_t a, int16x8_t b) { - // CHECK-LABEL: test_vaddw_high_s16 return vaddw_high_s16(a, b); - // CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP2]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddw_high_s32(int64x2_t a, int32x4_t b) { - // CHECK-LABEL: test_vaddw_high_s32 return vaddw_high_s32(a, b); - // CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddw_high_u8(uint16x8_t a, uint8x16_t b) { - // CHECK-LABEL: test_vaddw_high_u8 return vaddw_high_u8(a, b); - // CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddw_high_u16(uint32x4_t a, uint16x8_t b) { - // CHECK-LABEL: test_vaddw_high_u16 return vaddw_high_u16(a, b); - // CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// 
CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP2]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) { - // CHECK-LABEL: test_vaddw_high_u32 return vaddw_high_u32(a, b); - // CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsubl_s8 return vsubl_s8(a, b); - // CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsubl_s16 return vsubl_s16(a, b); - // CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsubl_s32 return vsubl_s32(a, b); - // CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vsubl_u8 return vsubl_u8(a, b); - // CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t 
test_vsubl_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vsubl_u16 return vsubl_u16(a, b); - // CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vsubl_u32 return vsubl_u32(a, b); - // CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubl_high_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsubl_high_s8 return vsubl_high_s8(a, b); - // CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubl_high_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsubl_high_s16 return vsubl_high_s16(a, b); - // CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubl_high_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsubl_high_s32 return vsubl_high_s32(a, b); - // CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: 
define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vsubl_high_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vsubl_high_u8 return vsubl_high_u8(a, b); - // CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubl_high_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vsubl_high_u16 return vsubl_high_u16(a, b); - // CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vsubl_high_u32 return vsubl_high_u32(a, b); - // CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsubw_s8 return vsubw_s8(a, b); - // CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsubw_s16 return vsubw_s16(a, b); - // CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h 
} +// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsubw_s32 return vsubw_s32(a, b); - // CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vsubw_u8 return vsubw_u8(a, b); - // CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vsubw_u16 return vsubw_u16(a, b); - // CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vsubw_u32 return vsubw_u32(a, b); - // CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubw_high_s8(int16x8_t a, int8x16_t b) { - // CHECK-LABEL: test_vsubw_high_s8 return vsubw_high_s8(a, b); - // CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP2]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubw_high_s16(int32x4_t a, int16x8_t b) { - // CHECK-LABEL: test_vsubw_high_s16 return vsubw_high_s16(a, b); - // CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x 
i32>
+// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP2]]
+// CHECK: ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubw_high_s32(int64x2_t a, int32x4_t b) {
- // CHECK-LABEL: test_vsubw_high_s32
   return vsubw_high_s32(a, b);
- // CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
+// CHECK: ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubw_high_u8(uint16x8_t a, uint8x16_t b) {
- // CHECK-LABEL: test_vsubw_high_u8
   return vsubw_high_u8(a, b);
- // CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP2]]
+// CHECK: ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubw_high_u16(uint32x4_t a, uint16x8_t b) {
- // CHECK-LABEL: test_vsubw_high_u16
   return vsubw_high_u16(a, b);
- // CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP2]]
+// CHECK: ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t b) {
- // CHECK-LABEL: test_vsubw_high_u32
   return vsubw_high_u32(a, b);
- // CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VADDHN2_I]]
 int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
- // CHECK-LABEL: test_vaddhn_s16
   return vaddhn_s16(a, b);
- // CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VADDHN2_I]]
 int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
- // CHECK-LABEL: test_vaddhn_s32
   return vaddhn_s32(a, b);
- // CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VADDHN2_I]]
 int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
- // CHECK-LABEL: test_vaddhn_s64
   return vaddhn_s64(a, b);
- // CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VADDHN2_I]]
 uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
- // CHECK-LABEL: test_vaddhn_u16
   return vaddhn_u16(a, b);
- // CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VADDHN2_I]]
 uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
- // CHECK-LABEL: test_vaddhn_u32
   return vaddhn_u32(a, b);
- // CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VADDHN2_I]]
 uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
- // CHECK-LABEL: test_vaddhn_u64
   return vaddhn_u64(a, b);
- // CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
+// CHECK-LABEL: define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:
[[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vaddhn_high_s16 return vaddhn_high_s16(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vaddhn_high_s32 return vaddhn_high_s32(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vaddhn_high_s64 return vaddhn_high_s64(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vaddhn_high_u16 return vaddhn_high_u16(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8>
[[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vaddhn_high_u32 return vaddhn_high_u32(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vaddhn_high_u64 return vaddhn_high_u64(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vraddhn_s16 return vraddhn_s16(a, b); - // CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vraddhn_s32 return vraddhn_s32(a, b); - // CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32>
@llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vraddhn_s64 return vraddhn_s64(a, b); - // CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vraddhn_u16 return vraddhn_u16(a, b); - // CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vraddhn_u32 return vraddhn_u32(a, b); - // CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vraddhn_u64 return vraddhn_u64(a, b); - // CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8>
[[SHUFFLE_I_I]] int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vraddhn_high_s16 return vraddhn_high_s16(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vraddhn_high_s32 return vraddhn_high_s32(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vraddhn_high_s64 return vraddhn_high_s64(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vraddhn_high_u16 return vraddhn_high_u16(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK:
[[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vraddhn_high_u32 return vraddhn_high_u32(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vraddhn_high_u64 return vraddhn_high_u64(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSUBHN2_I]] int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsubhn_s16 return vsubhn_s16(a, b); - // CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSUBHN2_I]] int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsubhn_s32 return vsubhn_s32(a, b); - // CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSUBHN_I:%.*]] = sub <2 x
i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSUBHN2_I]] int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsubhn_s64 return vsubhn_s64(a, b); - // CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSUBHN2_I]] uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vsubhn_u16 return vsubhn_u16(a, b); - // CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSUBHN2_I]] uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vsubhn_u32 return vsubhn_u32(a, b); - // CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSUBHN2_I]] uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vsubhn_u64 return vsubhn_u64(a, b); - // CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsubhn_high_s16 return vsubhn_high_s16(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +//
CHECK-LABEL: define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsubhn_high_s32 return vsubhn_high_s32(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsubhn_high_s64 return vsubhn_high_s64(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vsubhn_high_u16 return vsubhn_high_u16(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vsubhn_high_u32
return vsubhn_high_u32(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vsubhn_high_u64 return vsubhn_high_u64(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vrsubhn_s16 return vrsubhn_s16(a, b); - // CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vrsubhn_s32 return vrsubhn_s32(a, b); - // CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vrsubhn_s64 return vrsubhn_s64(a, b); - // CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK:
[[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vrsubhn_u16 return vrsubhn_u16(a, b); - // CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vrsubhn_u32 return vrsubhn_u32(a, b); - // CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vrsubhn_u64 return vrsubhn_u64(a, b); - // CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vrsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vrsubhn_high_s16 return vrsubhn_high_s16(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK:
[[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vrsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vrsubhn_high_s32 return vrsubhn_high_s32(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vrsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vrsubhn_high_s64 return vrsubhn_high_s64(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vrsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vrsubhn_high_u16 return vrsubhn_high_u16(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vrsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vrsubhn_high_u32
return vrsubhn_high_u32(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vrsubhn_high_u64 return vrsubhn_high_u64(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I]] int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vabdl_s8 return vabdl_s8(a, b); - // CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I]] int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vabdl_s16 return vabdl_s16(a, b); - // CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I]] int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vabdl_s32 return vabdl_s32(a, b); - // CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I]] uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vabdl_u8 return vabdl_u8(a, b); - // CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I]] uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vabdl_u16 return vabdl_u16(a, b); - // CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I]] uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vabdl_u32 return vabdl_u32(a, b); - // CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { - // CHECK-LABEL: test_vabal_s8 return vabal_s8(a, b, c); - // CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { - // CHECK-LABEL: test_vabal_s16 return vabal_s16(a, b, c); - // CHECK: 
sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { - // CHECK-LABEL: test_vabal_s32 return vabal_s32(a, b, c); - // CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { - // CHECK-LABEL: test_vabal_u8 return vabal_u8(a, b, c); - // CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { - // CHECK-LABEL: test_vabal_u16 return vabal_u16(a, b, c); - // CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { - // CHECK-LABEL: test_vabal_u32 return vabal_u32(a, b, c); - // CHECK: uabal {{v[0-9]+}}.2d, 
{{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I_I]] int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vabdl_high_s8 return vabdl_high_s8(a, b); - // CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I_I]] int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vabdl_high_s16 return vabdl_high_s16(a, b); - // CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I_I]] int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vabdl_high_s32 return vabdl_high_s32(a, b); - // CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I_I]] uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) { - //
CHECK-LABEL: test_vabdl_high_u8 return vabdl_high_u8(a, b); - // CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I_I]] uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vabdl_high_u16 return vabdl_high_u16(a, b); - // CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I_I]] uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vabdl_high_u32 return vabdl_high_u32(a, b); - // CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I_I]] int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { - // CHECK-LABEL: test_vabal_high_s8 return vabal_high_s8(a, b, c); - // CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast
<4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { - // CHECK-LABEL: test_vabal_high_s16 return vabal_high_s16(a, b, c); - // CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { - // CHECK-LABEL: test_vabal_high_s32 return vabal_high_s32(a, b, c); - // CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I_I]] uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { - // CHECK-LABEL: test_vabal_high_u8 return vabal_high_u8(a, b, c); - // CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x
i16> [[VABD1_I_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { - // CHECK-LABEL: test_vabal_high_u16 return vabal_high_u16(a, b, c); - // CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { - // CHECK-LABEL: test_vabal_high_u32 return vabal_high_u32(a, b, c); - // CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vmull_s8 return vmull_s8(a, b); - // CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vmull_s16 return vmull_s16(a, b); - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vmull_s32 return vmull_s32(a, b); - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL:
define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vmull_u8 return vmull_u8(a, b); - // CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vmull_u16 return vmull_u16(a, b); - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vmull_u32 return vmull_u32(a, b); - // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: ret <8 x i16> [[VMULL_I_I]] int16x8_t test_vmull_high_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vmull_high_s8 return vmull_high_s8(a, b); - // CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I_I]] int32x4_t test_vmull_high_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vmull_high_s16 return vmull_high_s16(a, b); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I_I]] int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vmull_high_s32 return vmull_high_s32(a, b); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: ret <8 x i16> [[VMULL_I_I]] uint16x8_t test_vmull_high_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vmull_high_u8 return vmull_high_u8(a, b); - // CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I_I]] uint32x4_t test_vmull_high_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vmull_high_u16 return vmull_high_u16(a, b); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I_I]] uint64x2_t test_vmull_high_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vmull_high_u32 return vmull_high_u32(a, b); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { - // CHECK-LABEL: test_vmlal_s8 return vmlal_s8(a, b, c); - // CHECK: smlal {{v[0-9]+}}.8h, 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { - // CHECK-LABEL: test_vmlal_s16 return vmlal_s16(a, b, c); - // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { - // CHECK-LABEL: test_vmlal_s32 return vmlal_s32(a, b, c); - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { - // CHECK-LABEL: test_vmlal_u8 return vmlal_u8(a, b, c); - // CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { - // CHECK-LABEL: test_vmlal_u16 return vmlal_u16(a, b, c); - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { - // CHECK-LABEL: test_vmlal_u32 return 
vmlal_u32(a, b, c); - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> +// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I_I]] int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { - // CHECK-LABEL: test_vmlal_high_s8 return vmlal_high_s8(a, b, c); - // CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { - // CHECK-LABEL: test_vmlal_high_s16 return vmlal_high_s16(a, b, c); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { - // CHECK-LABEL: test_vmlal_high_s32 return vmlal_high_s32(a, b, c); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> +// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I_I]] uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { - // CHECK-LABEL: test_vmlal_high_u8 return vmlal_high_u8(a, b, c); - // CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// 
CHECK-LABEL: define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { - // CHECK-LABEL: test_vmlal_high_u16 return vmlal_high_u16(a, b, c); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { - // CHECK-LABEL: test_vmlal_high_u32 return vmlal_high_u32(a, b, c); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { - // CHECK-LABEL: test_vmlsl_s8 return vmlsl_s8(a, b, c); - // CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { - // CHECK-LABEL: test_vmlsl_s16 return vmlsl_s16(a, b, c); - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> 
[[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { - // CHECK-LABEL: test_vmlsl_s32 return vmlsl_s32(a, b, c); - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { - // CHECK-LABEL: test_vmlsl_u8 return vmlsl_u8(a, b, c); - // CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { - // CHECK-LABEL: test_vmlsl_u16 return vmlsl_u16(a, b, c); - // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { - // CHECK-LABEL: test_vmlsl_u32 return vmlsl_u32(a, b, c); - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> +// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]] +// CHECK: ret <8 x i16> [[SUB_I_I]] int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { - // CHECK-LABEL: test_vmlsl_high_s8 return vmlsl_high_s8(a, b, c); - // CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// 
CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[SUB_I_I]] int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { - // CHECK-LABEL: test_vmlsl_high_s16 return vmlsl_high_s16(a, b, c); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[SUB_I_I]] int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { - // CHECK-LABEL: test_vmlsl_high_s32 return vmlsl_high_s32(a, b, c); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> +// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]] +// CHECK: ret <8 x i16> [[SUB_I_I]] uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { - // CHECK-LABEL: test_vmlsl_high_u8 return vmlsl_high_u8(a, b, c); - // CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[SUB_I_I]] uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { - // CHECK-LABEL: test_vmlsl_high_u16 return vmlsl_high_u16(a, b, c); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x 
i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[SUB_I_I]] uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { - // CHECK-LABEL: test_vmlsl_high_u32 return vmlsl_high_u32(a, b, c); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vqdmull_s16 return vqdmull_s16(a, b); - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vqdmull_s32 return vqdmull_s32(a, b); - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { - // 
CHECK-LABEL: test_vqdmlal_s16 return vqdmlal_s16(a, b, c); - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { - // CHECK-LABEL: test_vqdmlal_s32 return vqdmlal_s32(a, b, c); - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { - // CHECK-LABEL: test_vqdmlsl_s16 return vqdmlsl_s16(a, b, c); - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { - // CHECK-LABEL: test_vqdmlsl_s32 return vqdmlsl_s32(a, b, c); - // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// 
CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V1_I_I]]) #4 +// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vqdmull_high_s16 return vqdmull_high_s16(a, b); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V1_I_I]]) #4 +// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqdmull_high_s32 return vqdmull_high_s32(a, b); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) #4 +// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) #4 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I_I]] int32x4_t test_vqdmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { - // CHECK-LABEL: test_vqdmlal_high_s16 return vqdmlal_high_s16(a, b, c); - // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x 
i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) #4 +// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) #4 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I_I]] int64x2_t test_vqdmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { - // CHECK-LABEL: test_vqdmlal_high_s32 return vqdmlal_high_s32(a, b, c); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) #4 +// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) #4 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I_I]] int32x4_t test_vqdmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { - // CHECK-LABEL: test_vqdmlsl_high_s16 return vqdmlsl_high_s16(a, b, c); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) #4 +// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) #4 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I_I]] int64x2_t test_vqdmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { - // CHECK-LABEL: test_vqdmlsl_high_s32 return vqdmlsl_high_s32(a, b, c); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vmull_p8 return vmull_p8(a, b); - // CHECK: pmull 
{{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: ret <8 x i16> [[VMULL_I_I]] poly16x8_t test_vmull_high_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vmull_high_p8 return vmull_high_p8(a, b); - // CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i64 @test_vaddd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VADDD_I:%.*]] = add i64 %a, %b +// CHECK: ret i64 [[VADDD_I]] int64_t test_vaddd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vaddd_s64 return vaddd_s64(a, b); -// CHECK: add {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define i64 @test_vaddd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VADDD_I:%.*]] = add i64 %a, %b +// CHECK: ret i64 [[VADDD_I]] uint64_t test_vaddd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vaddd_u64 return vaddd_u64(a, b); -// CHECK: add {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define i64 @test_vsubd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b +// CHECK: ret i64 [[VSUBD_I]] int64_t test_vsubd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vsubd_s64 return vsubd_s64(a, b); -// CHECK: sub {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define i64 @test_vsubd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b +// CHECK: ret i64 [[VSUBD_I]] uint64_t test_vsubd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vsubd_u64 return vsubd_u64(a, b); -// CHECK: sub {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define i8 @test_vqaddb_s8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] int8_t test_vqaddb_s8(int8_t a, int8_t b) { -// CHECK-LABEL: test_vqaddb_s8 return vqaddb_s8(a, b); -// CHECK: sqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } +// CHECK-LABEL: define i16 @test_vqaddh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] int16_t test_vqaddh_s16(int16_t a, int16_t b) { -// CHECK-LABEL: test_vqaddh_s16 return vqaddh_s16(a, b); -// CHECK: sqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } +// CHECK-LABEL: define i32 @test_vqadds_s32(i32 %a, i32 %b) #0 { +// CHECK: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQADDS_S32_I]] int32_t test_vqadds_s32(int32_t a, int32_t b) { -// CHECK-LABEL: test_vqadds_s32 return vqadds_s32(a, b); -// CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } +// CHECK-LABEL: define i64 @test_vqaddd_s64(i64 %a, 
i64 %b) #0 { +// CHECK: [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQADDD_S64_I]] int64_t test_vqaddd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vqaddd_s64 return vqaddd_s64(a, b); -// CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i8 @test_vqaddb_u8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) { -// CHECK-LABEL: test_vqaddb_u8 return vqaddb_u8(a, b); -// CHECK: uqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } +// CHECK-LABEL: define i16 @test_vqaddh_u16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) { -// CHECK-LABEL: test_vqaddh_u16 return vqaddh_u16(a, b); -// CHECK: uqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } +// CHECK-LABEL: define i32 @test_vqadds_u32(i32 %a, i32 %b) #0 { +// CHECK: [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQADDS_U32_I]] uint32_t test_vqadds_u32(uint32_t a, uint32_t b) { -// CHECK-LABEL: test_vqadds_u32 return vqadds_u32(a, b); -// CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } +// CHECK-LABEL: define i64 @test_vqaddd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQADDD_U64_I]] uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vqaddd_u64 return vqaddd_u64(a, b); -// CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i8 @test_vqsubb_s8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] int8_t test_vqsubb_s8(int8_t a, int8_t b) { -// CHECK-LABEL: test_vqsubb_s8 return vqsubb_s8(a, b); -// CHECK: sqsub {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } +// CHECK-LABEL: define i16 @test_vqsubh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] int16_t test_vqsubh_s16(int16_t a, int16_t b) { -// CHECK-LABEL: test_vqsubh_s16 return vqsubh_s16(a, b); -// CHECK: sqsub {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } +// CHECK-LABEL: define i32 @test_vqsubs_s32(i32 
%a, i32 %b) #0 { +// CHECK: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQSUBS_S32_I]] int32_t test_vqsubs_s32(int32_t a, int32_t b) { - // CHECK-LABEL: test_vqsubs_s32 return vqsubs_s32(a, b); -// CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } +// CHECK-LABEL: define i64 @test_vqsubd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQSUBD_S64_I]] int64_t test_vqsubd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vqsubd_s64 return vqsubd_s64(a, b); -// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i8 @test_vqsubb_u8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) { -// CHECK-LABEL: test_vqsubb_u8 return vqsubb_u8(a, b); -// CHECK: uqsub {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } +// CHECK-LABEL: define i16 @test_vqsubh_u16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) { -// CHECK-LABEL: test_vqsubh_u16 return vqsubh_u16(a, b); -// CHECK: uqsub {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } +// CHECK-LABEL: define i32 @test_vqsubs_u32(i32 %a, i32 %b) #0 { +// CHECK: [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQSUBS_U32_I]] uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) { -// CHECK-LABEL: test_vqsubs_u32 return vqsubs_u32(a, b); -// CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } +// CHECK-LABEL: define i64 @test_vqsubd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQSUBD_U64_I]] uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vqsubd_u64 return vqsubd_u64(a, b); -// CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i64 @test_vshld_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VSHLD_S64_I]] int64_t test_vshld_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vshld_s64 return vshld_s64(a, b); -// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i64 @test_vshld_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VSHLD_U64_I]] uint64_t test_vshld_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vshld_u64 return vshld_u64(a, b); -// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vqshlb_s8 +// CHECK-LABEL: define i8 @test_vqshlb_s8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 
0 +// CHECK: [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] int8_t test_vqshlb_s8(int8_t a, int8_t b) { return vqshlb_s8(a, b); -// CHECK: sqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } -// CHECK-LABEL: test_vqshlh_s16 +// CHECK-LABEL: define i16 @test_vqshlh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] int16_t test_vqshlh_s16(int16_t a, int16_t b) { return vqshlh_s16(a, b); -// CHECK: sqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } -// CHECK-LABEL: test_vqshls_s32 +// CHECK-LABEL: define i32 @test_vqshls_s32(i32 %a, i32 %b) #0 { +// CHECK: [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQSHLS_S32_I]] int32_t test_vqshls_s32(int32_t a, int32_t b) { return vqshls_s32(a, b); -// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } -// CHECK-LABEL: test_vqshld_s64 +// CHECK-LABEL: define i64 @test_vqshld_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQSHLD_S64_I]] int64_t test_vqshld_s64(int64_t a, int64_t b) { return vqshld_s64(a, b); -// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vqshlb_u8 +// CHECK-LABEL: define i8 @test_vqshlb_u8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] uint8_t test_vqshlb_u8(uint8_t a, uint8_t b) { return vqshlb_u8(a, b); -// CHECK: uqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } -// CHECK-LABEL: test_vqshlh_u16 +// CHECK-LABEL: define i16 @test_vqshlh_u16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] uint16_t test_vqshlh_u16(uint16_t a, uint16_t b) { return vqshlh_u16(a, b); -// CHECK: uqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } -// CHECK-LABEL: test_vqshls_u32 +// CHECK-LABEL: define i32 @test_vqshls_u32(i32 %a, i32 %b) #0 { +// CHECK: [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQSHLS_U32_I]] uint32_t test_vqshls_u32(uint32_t a, uint32_t b) { return vqshls_u32(a, b); -// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } -// CHECK-LABEL: test_vqshld_u64 +// CHECK-LABEL: define i64 @test_vqshld_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQSHLD_U64_I]] 
uint64_t test_vqshld_u64(uint64_t a, uint64_t b) { return vqshld_u64(a, b); -// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vrshld_s64 +// CHECK-LABEL: define i64 @test_vrshld_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VRSHLD_S64_I]] int64_t test_vrshld_s64(int64_t a, int64_t b) { return vrshld_s64(a, b); -// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vrshld_u64 +// CHECK-LABEL: define i64 @test_vrshld_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VRSHLD_U64_I]] uint64_t test_vrshld_u64(uint64_t a, uint64_t b) { return vrshld_u64(a, b); -// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vqrshlb_s8 +// CHECK-LABEL: define i8 @test_vqrshlb_s8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] int8_t test_vqrshlb_s8(int8_t a, int8_t b) { return vqrshlb_s8(a, b); -// CHECK: sqrshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } -// CHECK-LABEL: test_vqrshlh_s16 +// CHECK-LABEL: define i16 @test_vqrshlh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] int16_t test_vqrshlh_s16(int16_t a, int16_t b) { return vqrshlh_s16(a, b); -// CHECK: sqrshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } -// CHECK-LABEL: test_vqrshls_s32 +// CHECK-LABEL: define i32 @test_vqrshls_s32(i32 %a, i32 %b) #0 { +// CHECK: [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQRSHLS_S32_I]] int32_t test_vqrshls_s32(int32_t a, int32_t b) { return vqrshls_s32(a, b); -// CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } -// CHECK-LABEL: test_vqrshld_s64 +// CHECK-LABEL: define i64 @test_vqrshld_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQRSHLD_S64_I]] int64_t test_vqrshld_s64(int64_t a, int64_t b) { return vqrshld_s64(a, b); -// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vqrshlb_u8 +// CHECK-LABEL: define i8 @test_vqrshlb_u8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] uint8_t test_vqrshlb_u8(uint8_t a, uint8_t b) { return vqrshlb_u8(a, b); -// CHECK: uqrshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } -// CHECK-LABEL: test_vqrshlh_u16 +// CHECK-LABEL: define i16 @test_vqrshlh_u16(i16 %a, i16 %b) #0 { +// 
CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] uint16_t test_vqrshlh_u16(uint16_t a, uint16_t b) { return vqrshlh_u16(a, b); -// CHECK: uqrshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } -// CHECK-LABEL: test_vqrshls_u32 +// CHECK-LABEL: define i32 @test_vqrshls_u32(i32 %a, i32 %b) #0 { +// CHECK: [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQRSHLS_U32_I]] uint32_t test_vqrshls_u32(uint32_t a, uint32_t b) { return vqrshls_u32(a, b); -// CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } -// CHECK-LABEL: test_vqrshld_u64 +// CHECK-LABEL: define i64 @test_vqrshld_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQRSHLD_U64_I]] uint64_t test_vqrshld_u64(uint64_t a, uint64_t b) { return vqrshld_u64(a, b); -// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vpaddd_s64 +// CHECK-LABEL: define i64 @test_vpaddd_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4 +// CHECK: ret i64 [[VPADDD_S64_I]] int64_t test_vpaddd_s64(int64x2_t a) { return vpaddd_s64(a); -// CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d } -// CHECK-LABEL: test_vpadds_f32 +// CHECK-LABEL: define float @test_vpadds_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE0_I:%.*]] = extractelement <2 x float> [[TMP1]], i64 0 +// CHECK: [[LANE1_I:%.*]] = extractelement <2 x float> [[TMP1]], i64 1 +// CHECK: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]] +// CHECK: ret float [[VPADDD_I]] float32_t test_vpadds_f32(float32x2_t a) { return vpadds_f32(a); -// CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s } -// CHECK-LABEL: test_vpaddd_f64 +// CHECK-LABEL: define double @test_vpaddd_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[LANE0_I:%.*]] = extractelement <2 x double> [[TMP1]], i64 0 +// CHECK: [[LANE1_I:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 +// CHECK: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]] +// CHECK: ret double [[VPADDD_I]] float64_t test_vpaddd_f64(float64x2_t a) { return vpaddd_f64(a); -// CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d } -// CHECK-LABEL: test_vpmaxnms_f32 +// CHECK-LABEL: define float @test_vpmaxnms_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VPMAXNMS_F32_I]] float32_t test_vpmaxnms_f32(float32x2_t a) { return vpmaxnms_f32(a); -// CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s } -// CHECK-LABEL: test_vpmaxnmqd_f64 +// CHECK-LABEL: define double @test_vpmaxnmqd_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = 
-// CHECK-LABEL: test_vpmaxnms_f32
+// CHECK-LABEL: define float @test_vpmaxnms_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: ret float [[VPMAXNMS_F32_I]]
float32_t test_vpmaxnms_f32(float32x2_t a) {
  return vpmaxnms_f32(a);
-// CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
}

-// CHECK-LABEL: test_vpmaxnmqd_f64
+// CHECK-LABEL: define double @test_vpmaxnmqd_f64(<2 x double> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: ret double [[VPMAXNMQD_F64_I]]
float64_t test_vpmaxnmqd_f64(float64x2_t a) {
  return vpmaxnmqd_f64(a);
-// CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
}

-// CHECK-LABEL: test_vpmaxs_f32
+// CHECK-LABEL: define float @test_vpmaxs_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: ret float [[VPMAXS_F32_I]]
float32_t test_vpmaxs_f32(float32x2_t a) {
  return vpmaxs_f32(a);
-// CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
}

-// CHECK-LABEL: test_vpmaxqd_f64
+// CHECK-LABEL: define double @test_vpmaxqd_f64(<2 x double> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: ret double [[VPMAXQD_F64_I]]
float64_t test_vpmaxqd_f64(float64x2_t a) {
  return vpmaxqd_f64(a);
-// CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
}

-// CHECK-LABEL: test_vpminnms_f32
+// CHECK-LABEL: define float @test_vpminnms_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: ret float [[VPMINNMS_F32_I]]
float32_t test_vpminnms_f32(float32x2_t a) {
  return vpminnms_f32(a);
-// CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
}

-// CHECK-LABEL: test_vpminnmqd_f64
+// CHECK-LABEL: define double @test_vpminnmqd_f64(<2 x double> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: ret double [[VPMINNMQD_F64_I]]
float64_t test_vpminnmqd_f64(float64x2_t a) {
  return vpminnmqd_f64(a);
-// CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
}

-// CHECK-LABEL: test_vpmins_f32
+// CHECK-LABEL: define float @test_vpmins_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: ret float [[VPMINS_F32_I]]
float32_t test_vpmins_f32(float32x2_t a) {
  return vpmins_f32(a);
-// CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
}

-// CHECK-LABEL: test_vpminqd_f64
+// CHECK-LABEL: define double @test_vpminqd_f64(<2 x double> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: ret double [[VPMINQD_F64_I]]
float64_t test_vpminqd_f64(float64x2_t a) {
  return vpminqd_f64(a);
-// CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
}
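The "nm" variants reduce with IEEE maxNum/minNum semantics, so a quiet NaN lane loses to a numeric lane, while the plain pairwise max/min propagates the NaN. A sketch of the difference (illustrative harness, assuming the usual IEEE-754 behavior of fmaxnmp versus fmaxp):

#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  float data[2] = {NAN, 2.0f};
  float32x2_t v = vld1_f32(data);
  /* maxNum semantics: the quiet NaN is ignored, yielding 2.0. */
  printf("%f\n", vpmaxnms_f32(v)); /* expect: 2.000000 */
  /* Plain pairwise max propagates the NaN instead. */
  printf("%f\n", vpmaxs_f32(v));   /* expect: nan */
  return 0;
}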
+// CHECK-LABEL: define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK: ret i16 [[TMP2]]
int16_t test_vqdmulhh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqdmulhh_s16
  return vqdmulhh_s16(a, b);
-// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
}

+// CHECK-LABEL: define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) #0 {
+// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 %b) #4
+// CHECK: ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqdmulhs_s32
  return vqdmulhs_s32(a, b);
-// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
}

+// CHECK-LABEL: define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK: ret i16 [[TMP2]]
int16_t test_vqrdmulhh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqrdmulhh_s16
  return vqrdmulhh_s16(a, b);
-// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
}

+// CHECK-LABEL: define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) #0 {
+// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 %b) #4
+// CHECK: ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqrdmulhs_s32
  return vqrdmulhs_s32(a, b);
-// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
}
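sqdmulh and sqrdmulh return the high half of a saturating doubling multiply, i.e. a Q15/Q31 fixed-point multiply; only -1.0 * -1.0 can saturate. For example (illustrative harness):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  /* Q15 fixed point: 0.5 * 0.5 = 0.25, i.e. 16384 -> 8192, computed
     as (2*a*b) >> 16. */
  printf("%d\n", vqdmulhh_s16(16384, 16384));   /* expect: 8192 */
  /* The one overflow case: -1.0 * -1.0 doubles to 2^31 and saturates,
     giving INT16_MAX after the shift. */
  printf("%d\n", vqdmulhh_s16(-32768, -32768)); /* expect: 32767 */
  return 0;
}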
+// CHECK-LABEL: define float @test_vmulxs_f32(float %a, float %b) #0 {
+// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) #4
+// CHECK: ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vmulxs_f32
  return vmulxs_f32(a, b);
-// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
}

+// CHECK-LABEL: define double @test_vmulxd_f64(double %a, double %b) #0 {
+// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) #4
+// CHECK: ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vmulxd_f64
  return vmulxd_f64(a, b);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}

+// CHECK-LABEL: define <1 x double> @test_vmulx_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK: [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> [[VMULX_I]], <1 x double> [[VMULX1_I]]) #4
+// CHECK: ret <1 x double> [[VMULX2_I]]
float64x1_t test_vmulx_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmulx_f64
  return vmulx_f64(a, b);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}

+// CHECK-LABEL: define float @test_vrecpss_f32(float %a, float %b) #0 {
+// CHECK: [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float %a, float %b) #4
+// CHECK: ret float [[VRECPS_I]]
float32_t test_vrecpss_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vrecpss_f32
  return vrecpss_f32(a, b);
-// CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
}

+// CHECK-LABEL: define double @test_vrecpsd_f64(double %a, double %b) #0 {
+// CHECK: [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double %a, double %b) #4
+// CHECK: ret double [[VRECPS_I]]
float64_t test_vrecpsd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vrecpsd_f64
  return vrecpsd_f64(a, b);
-// CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}

+// CHECK-LABEL: define float @test_vrsqrtss_f32(float %a, float %b) #0 {
+// CHECK: [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) #4
+// CHECK: ret float [[VRSQRTSS_F32_I]]
float32_t test_vrsqrtss_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vrsqrtss_f32
  return vrsqrtss_f32(a, b);
-// CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
}

+// CHECK-LABEL: define double @test_vrsqrtsd_f64(double %a, double %b) #0 {
+// CHECK: [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) #4
+// CHECK: ret double [[VRSQRTSD_F64_I]]
float64_t test_vrsqrtsd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vrsqrtsd_f64
  return vrsqrtsd_f64(a, b);
-// CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
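frecps is the Newton-Raphson step 2 - a*b, which pairs with the frecpe estimate to refine a reciprocal. A sketch of the usual two-step iteration (the recip helper name is illustrative, not part of the test):

#include <arm_neon.h>
#include <stdio.h>

/* frecps computes 2 - d*x, so x * (2 - d*x) is one Newton-Raphson
   step toward 1/d. */
static float recip(float d) {
  float x = vrecpes_f32(d);      /* low-precision initial estimate */
  x = vrecpss_f32(d, x) * x;
  x = vrecpss_f32(d, x) * x;
  return x;
}

int main(void) {
  printf("%f\n", recip(3.0f)); /* expect: ~0.333333 */
  return 0;
}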
+// CHECK-LABEL: define float @test_vcvts_f32_s32(i32 %a) #0 {
+// CHECK: [[TMP0:%.*]] = sitofp i32 %a to float
+// CHECK: ret float [[TMP0]]
float32_t test_vcvts_f32_s32(int32_t a) {
-// CHECK-LABEL: test_vcvts_f32_s32
-// CHECK: scvtf {{s[0-9]+}}, {{[ws][0-9]+}}
  return vcvts_f32_s32(a);
}

+// CHECK-LABEL: define double @test_vcvtd_f64_s64(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = sitofp i64 %a to double
+// CHECK: ret double [[TMP0]]
float64_t test_vcvtd_f64_s64(int64_t a) {
-// CHECK-LABEL: test_vcvtd_f64_s64
-// CHECK: scvtf {{d[0-9]+}}, {{[dx][0-9]+}}
  return vcvtd_f64_s64(a);
}

+// CHECK-LABEL: define float @test_vcvts_f32_u32(i32 %a) #0 {
+// CHECK: [[TMP0:%.*]] = uitofp i32 %a to float
+// CHECK: ret float [[TMP0]]
float32_t test_vcvts_f32_u32(uint32_t a) {
-// CHECK-LABEL: test_vcvts_f32_u32
-// CHECK: ucvtf {{s[0-9]+}}, {{[ws][0-9]+}}
  return vcvts_f32_u32(a);
}

+// CHECK-LABEL: define double @test_vcvtd_f64_u64(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = uitofp i64 %a to double
+// CHECK: ret double [[TMP0]]
float64_t test_vcvtd_f64_u64(uint64_t a) {
-// CHECK-LABEL: test_vcvtd_f64_u64
-// CHECK: ucvtf {{d[0-9]+}}, {{[xd][0-9]+}}
  return vcvtd_f64_u64(a);
}

+// CHECK-LABEL: define float @test_vrecpes_f32(float %a) #0 {
+// CHECK: [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float %a) #4
+// CHECK: ret float [[VRECPES_F32_I]]
float32_t test_vrecpes_f32(float32_t a) {
-// CHECK-LABEL: test_vrecpes_f32
-// CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}}
  return vrecpes_f32(a);
}

+// CHECK-LABEL: define double @test_vrecped_f64(double %a) #0 {
+// CHECK: [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double %a) #4
+// CHECK: ret double [[VRECPED_F64_I]]
float64_t test_vrecped_f64(float64_t a) {
-// CHECK-LABEL: test_vrecped_f64
-// CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}}
  return vrecped_f64(a);
}

+// CHECK-LABEL: define float @test_vrecpxs_f32(float %a) #0 {
+// CHECK: [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float %a) #4
+// CHECK: ret float [[VRECPXS_F32_I]]
float32_t test_vrecpxs_f32(float32_t a) {
-// CHECK-LABEL: test_vrecpxs_f32
-// CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}}
  return vrecpxs_f32(a);
}

+// CHECK-LABEL: define double @test_vrecpxd_f64(double %a) #0 {
+// CHECK: [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double %a) #4
+// CHECK: ret double [[VRECPXD_F64_I]]
float64_t test_vrecpxd_f64(float64_t a) {
-// CHECK-LABEL: test_vrecpxd_f64
-// CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}}
  return vrecpxd_f64(a);
}

+// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4
+// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]]
uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
-// CHECK-LABEL: test_vrsqrte_u32
-// CHECK: ursqrte {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  return vrsqrte_u32(a);
}

+// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4
+// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]]
uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vrsqrteq_u32
-// CHECK: ursqrte {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  return vrsqrteq_u32(a);
}

+// CHECK-LABEL: define float @test_vrsqrtes_f32(float %a) #0 {
+// CHECK: [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float %a) #4
+// CHECK: ret float [[VRSQRTES_F32_I]]
float32_t test_vrsqrtes_f32(float32_t a) {
-// CHECK: vrsqrtes_f32
-// CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}}
  return vrsqrtes_f32(a);
}

+// CHECK-LABEL: define double @test_vrsqrted_f64(double %a) #0 {
+// CHECK: [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double %a) #4
+// CHECK: ret double [[VRSQRTED_F64_I]]
float64_t test_vrsqrted_f64(float64_t a) {
-// CHECK: vrsqrted_f64
-// CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}}
  return vrsqrted_f64(a);
}
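Similarly, frsqrts computes (3 - a*b)/2 and pairs with the frsqrte estimate for reciprocal square roots. A sketch (again, the rsqrt helper name is illustrative):

#include <arm_neon.h>
#include <stdio.h>

/* frsqrts computes (3 - a*b)/2, so x * frsqrts(d*x, x) is one
   Newton-Raphson step toward 1/sqrt(d). */
static float rsqrt(float d) {
  float x = vrsqrtes_f32(d);     /* low-precision initial estimate */
  x = x * vrsqrtss_f32(d * x, x);
  x = x * vrsqrtss_f32(d * x, x);
  return x;
}

int main(void) {
  printf("%f\n", rsqrt(4.0f)); /* expect: ~0.5 */
  return 0;
}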
+// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK: ret <16 x i8> [[TMP1]]
uint8x16_t test_vld1q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1q_u8
  return vld1q_u8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vld1q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1q_u16
  return vld1q_u16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vld1q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1q_u32
  return vld1q_u32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vld1q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1q_u64
  return vld1q_u64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK: ret <16 x i8> [[TMP1]]
int8x16_t test_vld1q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld1q_s8
  return vld1q_s8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vld1q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld1q_s16
  return vld1q_s16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vld1q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld1q_s32
  return vld1q_s32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vld1q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld1q_s64
  return vld1q_s64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half>
+// CHECK: ret <8 x half> [[TMP3]]
float16x8_t test_vld1q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld1q_f16
  return vld1q_f16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]]
+// CHECK: ret <4 x float> [[TMP2]]
float32x4_t test_vld1q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld1q_f32
  return vld1q_f32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x double> @test_vld1q_f64(double* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
+// CHECK: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]]
+// CHECK: ret <2 x double> [[TMP2]]
float64x2_t test_vld1q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld1q_f64
  return vld1q_f64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK: ret <16 x i8> [[TMP1]]
poly8x16_t test_vld1q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1q_p8
  return vld1q_p8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK: ret <8 x i16> [[TMP2]]
poly16x8_t test_vld1q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1q_p16
  return vld1q_p16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK: ret <8 x i8> [[TMP1]]
uint8x8_t test_vld1_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1_u8
  return vld1_u8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vld1_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1_u16
  return vld1_u16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vld1_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1_u32
  return vld1_u32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vld1_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1_u64
  return vld1_u64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK: ret <8 x i8> [[TMP1]]
int8x8_t test_vld1_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld1_s8
  return vld1_s8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vld1_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld1_s16
  return vld1_s16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vld1_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld1_s32
  return vld1_s32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vld1_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld1_s64
  return vld1_s64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP3]]
float16x4_t test_vld1_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld1_f16
  return vld1_f16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]]
+// CHECK: ret <2 x float> [[TMP2]]
float32x2_t test_vld1_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld1_f32
  return vld1_f32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <1 x double> @test_vld1_f64(double* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
+// CHECK: [[TMP2:%.*]] = load <1 x double>, <1 x double>* [[TMP1]]
+// CHECK: ret <1 x double> [[TMP2]]
float64x1_t test_vld1_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld1_f64
  return vld1_f64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK: ret <8 x i8> [[TMP1]]
poly8x8_t test_vld1_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1_p8
  return vld1_p8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK: ret <4 x i16> [[TMP2]]
poly16x4_t test_vld1_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1_p16
  return vld1_p16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}
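The vld1/vld1q family is a plain contiguous load: lanes keep memory order, which is why it lowers to a single IR load. For example (illustrative harness):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  uint8_t bytes[16] = {0, 1, 2, 3, 4, 5, 6, 7,
                       8, 9, 10, 11, 12, 13, 14, 15};
  /* One unstructured 128-bit load; lane i holds bytes[i]. */
  uint8x16_t v = vld1q_u8(bytes);
  printf("%u %u\n", vgetq_lane_u8(v, 0), vgetq_lane_u8(v, 15)); /* 0 15 */
  return 0;
}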
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x2_t [[TMP5]]
uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld2q_u8
  return vld2q_u8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld2q_u16
  return vld2q_u16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld2q_u32
  return vld2q_u32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld2q_u64
  return vld2q_u64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x2_t [[TMP5]]
int8x16x2_t test_vld2q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld2q_s8
  return vld2q_s8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x2_t [[TMP6]]
int16x8x2_t test_vld2q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld2q_s16
  return vld2q_s16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x2_t [[TMP6]]
int32x4x2_t test_vld2q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld2q_s32
  return vld2q_s32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld2q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld2q_s64
  return vld2q_s64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float16x8x2_t [[TMP6]]
float16x8x2_t test_vld2q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld2q_f16
  return vld2q_f16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x2_t [[TMP6]]
float32x4x2_t test_vld2q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld2q_f32
  return vld2q_f32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_f64(double* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
+// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0v2f64(<2 x double>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld2q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld2q_f64
  return vld2q_f64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x2_t [[TMP5]]
poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld2q_p8
  return vld2q_p8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld2q_p16
  return vld2q_p16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}
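Unlike vld1q, the vld2q family deinterleaves while loading, which is why it lowers to the llvm.aarch64.neon.ld2 intrinsic returning a two-vector struct. A sketch of the lane placement (illustrative harness):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  /* Interleaved pairs in memory: a0 b0 a1 b1 ... */
  uint8_t pairs[32];
  for (int i = 0; i < 32; ++i)
    pairs[i] = (uint8_t)i;
  /* val[0] receives the even-indexed bytes, val[1] the odd-indexed. */
  uint8x16x2_t v = vld2q_u8(pairs);
  printf("%u %u\n", vgetq_lane_u8(v.val[0], 1),  /* pairs[2] = 2 */
                    vgetq_lane_u8(v.val[1], 1)); /* pairs[3] = 3 */
  return 0;
}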
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP5]]
uint8x8x2_t test_vld2_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld2_u8
  return vld2_u8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
uint16x4x2_t test_vld2_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld2_u16
  return vld2_u16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
uint32x2x2_t test_vld2_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld2_u32
  return vld2_u32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
uint64x1x2_t test_vld2_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld2_u64
  return vld2_u64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP5]]
int8x8x2_t test_vld2_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld2_s8
  return vld2_s8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP6]]
int16x4x2_t test_vld2_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld2_s16
  return vld2_s16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP6]]
int32x2x2_t test_vld2_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld2_s32
  return vld2_s32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int64x1x2_t [[TMP6]]
int64x1x2_t test_vld2_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld2_s64
  return vld2_s64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x2_t [[TMP6]]
float16x4x2_t test_vld2_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld2_f16
  return vld2_f16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP6]]
float32x2x2_t test_vld2_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld2_f32
  return vld2_f32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_f64(double* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
+// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0v1f64(<1 x double>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld2_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld2_f64
  return vld2_f64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP5]]
poly8x8x2_t test_vld2_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld2_p8
  return vld2_p8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld2_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld2_p16
  return vld2_p16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x3_t [[TMP5]]
uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld3q_u8
  return vld3q_u8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld3q_u16
  return vld3q_u16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld3q_u32
  return vld3q_u32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}
vld3q_u64(a); - // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x3_t [[TMP5]] int8x16x3_t test_vld3q_s8(int8_t const *a) { - // CHECK-LABEL: test_vld3q_s8 return vld3q_s8(a); - // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x3_t [[TMP6]] int16x8x3_t test_vld3q_s16(int16_t const *a) { - // CHECK-LABEL: test_vld3q_s16 return vld3q_s16(a); - // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>* +// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: 
[[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x3_t [[TMP6]] int32x4x3_t test_vld3q_s32(int32_t const *a) { - // CHECK-LABEL: test_vld3q_s32 return vld3q_s32(a); - // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x3_t [[TMP6]] int64x2x3_t test_vld3q_s64(int64_t const *a) { - // CHECK-LABEL: test_vld3q_s64 return vld3q_s64(a); - // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x3_t [[TMP6]] float16x8x3_t test_vld3q_f16(float16_t const *a) { - // CHECK-LABEL: test_vld3q_f16 return vld3q_f16(a); - // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>* +// CHECK: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x 
float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x3_t [[TMP6]] float32x4x3_t test_vld3q_f32(float32_t const *a) { - // CHECK-LABEL: test_vld3q_f32 return vld3q_f32(a); - // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>* +// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0v2f64(<2 x double>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x3_t [[TMP6]] float64x2x3_t test_vld3q_f64(float64_t const *a) { - // CHECK-LABEL: test_vld3q_f64 return vld3q_f64(a); - // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x3_t [[TMP5]] poly8x16x3_t test_vld3q_p8(poly8_t const *a) { - // CHECK-LABEL: test_vld3q_p8 return vld3q_p8(a); - // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// 
CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x3_t [[TMP6]] poly16x8x3_t test_vld3q_p16(poly16_t const *a) { - // CHECK-LABEL: test_vld3q_p16 return vld3q_p16(a); - // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP5]] uint8x8x3_t test_vld3_u8(uint8_t const *a) { - // CHECK-LABEL: test_vld3_u8 return vld3_u8(a); - // CHECK: ld3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP6]] uint16x4x3_t test_vld3_u16(uint16_t const *a) { - // CHECK-LABEL: test_vld3_u16 return vld3_u16(a); - // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 
8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP6]] uint32x2x3_t test_vld3_u32(uint32_t const *a) { - // CHECK-LABEL: test_vld3_u32 return vld3_u32(a); - // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x3_t [[TMP6]] uint64x1x3_t test_vld3_u64(uint64_t const *a) { - // CHECK-LABEL: test_vld3_u64 return vld3_u64(a); - // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x3_t [[TMP5]] int8x8x3_t test_vld3_s8(int8_t const *a) { - // CHECK-LABEL: test_vld3_s8 return vld3_s8(a); - // CHECK: ld3 {{{ ?v[0-9]+.8b, 
v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x3_t [[TMP6]] int16x4x3_t test_vld3_s16(int16_t const *a) { - // CHECK-LABEL: test_vld3_s16 return vld3_s16(a); - // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x3_t [[TMP6]] int32x2x3_t test_vld3_s32(int32_t const *a) { - // CHECK-LABEL: test_vld3_s32 return vld3_s32(a); - // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = 
load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x3_t [[TMP6]] int64x1x3_t test_vld3_s64(int64_t const *a) { - // CHECK-LABEL: test_vld3_s64 return vld3_s64(a); - // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x3_t [[TMP6]] float16x4x3_t test_vld3_f16(float16_t const *a) { - // CHECK-LABEL: test_vld3_f16 return vld3_f16(a); - // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>* +// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0v2f32(<2 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP6]] float32x2x3_t test_vld3_f32(float32_t const *a) { - // CHECK-LABEL: test_vld3_f32 return vld3_f32(a); - // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>* +// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0v1f64(<1 x double>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store 
{ <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x3_t [[TMP6]] float64x1x3_t test_vld3_f64(float64_t const *a) { - // CHECK-LABEL: test_vld3_f64 return vld3_f64(a); - // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x3_t [[TMP5]] poly8x8x3_t test_vld3_p8(poly8_t const *a) { - // CHECK-LABEL: test_vld3_p8 return vld3_p8(a); - // CHECK: ld3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP6]] poly16x4x3_t test_vld3_p16(poly16_t const *a) { - // CHECK-LABEL: test_vld3_p16 return vld3_p16(a); - // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x 
i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x4_t [[TMP5]] uint8x16x4_t test_vld4q_u8(uint8_t const *a) { - // CHECK-LABEL: test_vld4q_u8 return vld4q_u8(a); - // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x4_t [[TMP6]] uint16x8x4_t test_vld4q_u16(uint16_t const *a) { - // CHECK-LABEL: test_vld4q_u16 return vld4q_u16(a); - // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>* +// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x4_t [[TMP6]] uint32x4x4_t test_vld4q_u32(uint32_t const *a) { - // CHECK-LABEL: test_vld4q_u32 return vld4q_u32(a); - // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, 
[{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x4_t [[TMP6]] uint64x2x4_t test_vld4q_u64(uint64_t const *a) { - // CHECK-LABEL: test_vld4q_u64 return vld4q_u64(a); - // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x4_t [[TMP5]] int8x16x4_t test_vld4q_s8(int8_t const *a) { - // CHECK-LABEL: test_vld4q_s8 return vld4q_s8(a); - // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x4_t [[TMP6]] int16x8x4_t test_vld4q_s16(int16_t const *a) { - // CHECK-LABEL: test_vld4q_s16 return vld4q_s16(a); - // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>* +// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x4_t [[TMP6]] int32x4x4_t test_vld4q_s32(int32_t const *a) { - // CHECK-LABEL: test_vld4q_s32 return vld4q_s32(a); - // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x4_t [[TMP6]] int64x2x4_t test_vld4q_s64(int64_t const *a) { - // CHECK-LABEL: test_vld4q_s64 return vld4q_s64(a); - // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } 
@llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x4_t [[TMP6]] float16x8x4_t test_vld4q_f16(float16_t const *a) { - // CHECK-LABEL: test_vld4q_f16 return vld4q_f16(a); - // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>* +// CHECK: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x4_t [[TMP6]] float32x4x4_t test_vld4q_f32(float32_t const *a) { - // CHECK-LABEL: test_vld4q_f32 return vld4q_f32(a); - // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>* +// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0v2f64(<2 x double>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x4_t [[TMP6]] float64x2x4_t test_vld4q_f64(float64_t const *a) { - // CHECK-LABEL: 
test_vld4q_f64 return vld4q_f64(a); - // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x4_t [[TMP5]] poly8x16x4_t test_vld4q_p8(poly8_t const *a) { - // CHECK-LABEL: test_vld4q_p8 return vld4q_p8(a); - // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x4_t [[TMP6]] poly16x8x4_t test_vld4q_p16(poly16_t const *a) { - // CHECK-LABEL: test_vld4q_p16 return vld4q_p16(a); - // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* 
[[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x4_t [[TMP5]] uint8x8x4_t test_vld4_u8(uint8_t const *a) { - // CHECK-LABEL: test_vld4_u8 return vld4_u8(a); - // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x4_t [[TMP6]] uint16x4x4_t test_vld4_u16(uint16_t const *a) { - // CHECK-LABEL: test_vld4_u16 return vld4_u16(a); - // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x4_t [[TMP6]] uint32x2x4_t test_vld4_u32(uint32_t const *a) { - // CHECK-LABEL: test_vld4_u32 return vld4_u32(a); - // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x 
i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x4_t [[TMP6]] uint64x1x4_t test_vld4_u64(uint64_t const *a) { - // CHECK-LABEL: test_vld4_u64 return vld4_u64(a); - // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x4_t [[TMP5]] int8x8x4_t test_vld4_s8(int8_t const *a) { - // CHECK-LABEL: test_vld4_s8 return vld4_s8(a); - // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x4_t [[TMP6]] int16x4x4_t test_vld4_s16(int16_t const *a) { - // CHECK-LABEL: test_vld4_s16 return vld4_s16(a); - // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca 
%struct.int32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x4_t [[TMP6]] int32x2x4_t test_vld4_s32(int32_t const *a) { - // CHECK-LABEL: test_vld4_s32 return vld4_s32(a); - // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x4_t [[TMP6]] int64x1x4_t test_vld4_s64(int64_t const *a) { - // CHECK-LABEL: test_vld4_s64 return vld4_s64(a); - // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// 
CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x4_t [[TMP6]] float16x4x4_t test_vld4_f16(float16_t const *a) { - // CHECK-LABEL: test_vld4_f16 return vld4_f16(a); - // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>* +// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0v2f32(<2 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x4_t [[TMP6]] float32x2x4_t test_vld4_f32(float32_t const *a) { - // CHECK-LABEL: test_vld4_f32 return vld4_f32(a); - // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>* +// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0v1f64(<1 x double>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x4_t [[TMP6]] float64x1x4_t test_vld4_f64(float64_t const *a) { - // CHECK-LABEL: test_vld4_f64 return vld4_f64(a); - // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } 
@llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x4_t [[TMP5]] poly8x8x4_t test_vld4_p8(poly8_t const *a) { - // CHECK-LABEL: test_vld4_p8 return vld4_p8(a); - // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x4_t [[TMP6]] poly16x4x4_t test_vld4_p16(poly16_t const *a) { - // CHECK-LABEL: test_vld4_p16 return vld4_p16(a); - // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_u8(uint8_t *a, uint8x16_t b) { - // CHECK-LABEL: test_vst1q_u8 vst1q_u8(a, b); - // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_u16(uint16_t *a, uint16x8_t b) { - // CHECK-LABEL: test_vst1q_u16 vst1q_u16(a, b); - // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1q_u32(uint32_t *a, uint32x4_t b) { - // CHECK-LABEL: test_vst1q_u32 
vst1q_u32(a, b); - // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1q_u64(uint64_t *a, uint64x2_t b) { - // CHECK-LABEL: test_vst1q_u64 vst1q_u64(a, b); - // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_s8(int8_t *a, int8x16_t b) { - // CHECK-LABEL: test_vst1q_s8 vst1q_s8(a, b); - // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_s16(int16_t *a, int16x8_t b) { - // CHECK-LABEL: test_vst1q_s16 vst1q_s16(a, b); - // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1q_s32(int32_t *a, int32x4_t b) { - // CHECK-LABEL: test_vst1q_s32 vst1q_s32(a, b); - // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1q_s64(int64_t *a, int64x2_t b) { - // CHECK-LABEL: test_vst1q_s64 vst1q_s64(a, b); - // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_f16(float16_t *a, float16x8_t b) { - // CHECK-LABEL: test_vst1q_f16 vst1q_f16(a, b); - // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: store <4 x float> 
[[TMP3]], <4 x float>* [[TMP2]] +// CHECK: ret void void test_vst1q_f32(float32_t *a, float32x4_t b) { - // CHECK-LABEL: test_vst1q_f32 vst1q_f32(a, b); - // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f64(double* %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x double>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: store <2 x double> [[TMP3]], <2 x double>* [[TMP2]] +// CHECK: ret void void test_vst1q_f64(float64_t *a, float64x2_t b) { - // CHECK-LABEL: test_vst1q_f64 vst1q_f64(a, b); - // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_p8(poly8_t *a, poly8x16_t b) { - // CHECK-LABEL: test_vst1q_p8 vst1q_p8(a, b); - // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_p16(poly16_t *a, poly16x8_t b) { - // CHECK-LABEL: test_vst1q_p16 vst1q_p16(a, b); - // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_u8(uint8_t *a, uint8x8_t b) { - // CHECK-LABEL: test_vst1_u8 vst1_u8(a, b); - // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_u16(uint16_t *a, uint16x4_t b) { - // CHECK-LABEL: test_vst1_u16 vst1_u16(a, b); - // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1_u32(uint32_t *a, uint32x2_t b) { - // CHECK-LABEL: test_vst1_u32 vst1_u32(a, b); - // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]] +// CHECK: ret void void 
test_vst1_u64(uint64_t *a, uint64x1_t b) { - // CHECK-LABEL: test_vst1_u64 vst1_u64(a, b); - // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_s8(int8_t *a, int8x8_t b) { - // CHECK-LABEL: test_vst1_s8 vst1_s8(a, b); - // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_s16(int16_t *a, int16x4_t b) { - // CHECK-LABEL: test_vst1_s16 vst1_s16(a, b); - // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1_s32(int32_t *a, int32x2_t b) { - // CHECK-LABEL: test_vst1_s32 vst1_s32(a, b); - // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1_s64(int64_t *a, int64x1_t b) { - // CHECK-LABEL: test_vst1_s64 vst1_s64(a, b); - // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_f16(float16_t *a, float16x4_t b) { - // CHECK-LABEL: test_vst1_f16 vst1_f16(a, b); - // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: store <2 x float> [[TMP3]], <2 x float>* [[TMP2]] +// CHECK: ret void void test_vst1_f32(float32_t *a, float32x2_t b) { - // CHECK-LABEL: test_vst1_f32 vst1_f32(a, b); - // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f64(double* %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x double>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> 
[[TMP1]] to <1 x double> +// CHECK: store <1 x double> [[TMP3]], <1 x double>* [[TMP2]] +// CHECK: ret void void test_vst1_f64(float64_t *a, float64x1_t b) { - // CHECK-LABEL: test_vst1_f64 vst1_f64(a, b); - // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_p8(poly8_t *a, poly8x8_t b) { - // CHECK-LABEL: test_vst1_p8 vst1_p8(a, b); - // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_p16(poly16_t *a, poly16x4_t b) { - // CHECK-LABEL: test_vst1_p16 vst1_p16(a, b); - // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) { - // CHECK-LABEL: test_vst2q_u8 vst2q_u8(a, b); - // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) { - // CHECK-LABEL: test_vst2q_u16 vst2q_u16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) { - // CHECK-LABEL: test_vst2q_u32 vst2q_u32(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* 
[[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) { - // CHECK-LABEL: test_vst2q_u64 vst2q_u64(a, b); - // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_s8(int8_t *a, int8x16x2_t b) { - // CHECK-LABEL: test_vst2q_s8 vst2q_s8(a, b); - // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = 
getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_s16(int16_t *a, int16x8x2_t b) { - // CHECK-LABEL: test_vst2q_s16 vst2q_s16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_s32(int32_t *a, int32x4x2_t b) { - // CHECK-LABEL: test_vst2q_s32 vst2q_s32(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] 
to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_s64(int64_t *a, int64x2x2_t b) { - // CHECK-LABEL: test_vst2q_s64 vst2q_s64(a, b); - // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_f16(float16_t *a, float16x8x2_t b) { - // CHECK-LABEL: test_vst2q_f16 vst2q_f16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], 
i32 0, i32 0 +// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_f32(float32_t *a, float32x4x2_t b) { - // CHECK-LABEL: test_vst2q_f32 vst2q_f32(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: call void @llvm.aarch64.neon.st2.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_f64(float64_t *a, float64x2x2_t b) { - // CHECK-LABEL: test_vst2q_f64 vst2q_f64(a, b); - // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } 
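The st2 lowering pinned down by these vst2q checks stores its two source vectors element-interleaved, not back to back. A minimal standalone sketch of that behavior (assuming an AArch64 toolchain providing <arm_neon.h>; the buffer and constants below are illustrative only and are not taken from this test file):

  #include <arm_neon.h>
  #include <stdio.h>

  int main(void) {
    float32x4x2_t v;
    v.val[0] = vdupq_n_f32(1.0f);   /* lands in even elements 0, 2, 4, 6 */
    v.val[1] = vdupq_n_f32(2.0f);   /* lands in odd elements 1, 3, 5, 7 */
    float out[8];
    vst2q_f32(out, v);              /* one interleaving ST2 store */
    for (int i = 0; i < 8; ++i)
      printf("%.0f ", out[i]);      /* prints: 1 2 1 2 1 2 1 2 */
    putchar('\n');
    return 0;
  }

Note that the CHECK lines here only verify the IR-level call to @llvm.aarch64.neon.st2 and the struct-argument plumbing around it; the interleaved element order itself is a property of the ST2 instruction that intrinsic lowers to.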
+// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) { - // CHECK-LABEL: test_vst2q_p8 vst2q_p8(a, b); - // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) { - // CHECK-LABEL: test_vst2q_p16 vst2q_p16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca 
%struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2_u8(uint8_t *a, uint8x8x2_t b) { - // CHECK-LABEL: test_vst2_u8 vst2_u8(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u16(uint16_t *a, uint16x4x2_t b) { - // CHECK-LABEL: test_vst2_u16 vst2_u16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds 
%struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u32(uint32_t *a, uint32x2x2_t b) { - // CHECK-LABEL: test_vst2_u32 vst2_u32(a, b); - // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u64(uint64_t *a, uint64x1x2_t b) { - // CHECK-LABEL: test_vst2_u64 vst2_u64(a, b); - // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) 
#0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2_s8(int8_t *a, int8x8x2_t b) { - // CHECK-LABEL: test_vst2_s8 vst2_s8(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s16(int16_t *a, int16x4x2_t b) { - // CHECK-LABEL: test_vst2_s16 vst2_s16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds 
%struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s32(int32_t *a, int32x2x2_t b) { - // CHECK-LABEL: test_vst2_s32 vst2_s32(a, b); - // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s64(int64_t *a, int64x1x2_t b) { - // CHECK-LABEL: test_vst2_s64 vst2_s64(a, b); - // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: 
[[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_f16(float16_t *a, float16x4x2_t b) { - // CHECK-LABEL: test_vst2_f16 vst2_f16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st2.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void 
test_vst2_f32(float32_t *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_f32
   vst2_f32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <1 x double>] %b.coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK: call void @llvm.aarch64.neon.st2.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2_f64(float64_t *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_f64
   vst2_f64(a, b);
-  // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK: ret void
 void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_p8
   vst2_p8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_p16
   vst2_p16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_u8
   vst3q_u8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_u16
   vst3q_u16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_u32
   vst3q_u32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_u64
   vst3q_u64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_s8
   vst3q_s8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_s16
   vst3q_s16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_s32
   vst3q_s32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_s64
   vst3q_s64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x half>] %b.coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_f16
   vst3q_f16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x float>] %b.coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK: call void @llvm.aarch64.neon.st3.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_f32
   vst3q_f32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x double>] %b.coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK: call void @llvm.aarch64.neon.st3.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_f64
   vst3q_f64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_p8
   vst3q_p8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_p16
   vst3q_p16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_u8
   vst3_u8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_u16
   vst3_u16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_u32
   vst3_u32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_u64
   vst3_u64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3_s8(int8_t *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_s8
   vst3_s8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_s16(int16_t *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_s16
   vst3_s16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_s32(int32_t *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_s32
   vst3_s32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_s64(int64_t *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_s64
   vst3_s64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x half>] %b.coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_f16(float16_t *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_f16
   vst3_f16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x float>] %b.coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: call void @llvm.aarch64.neon.st3.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_f32(float32_t *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_f32
   vst3_f32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x double>] %b.coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK: call void @llvm.aarch64.neon.st3.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_f64(float64_t *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_f64
   vst3_f64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_p8
   vst3_p8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_p16
   vst3_p16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
 void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_u8
   vst4q_u8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_u16
   vst4q_u16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_u32
   vst4q_u32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_u64
   vst4q_u64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
 void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_s8
   vst4q_s8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_s16
   vst4q_s16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_s32
   vst4q_s32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_s64
   vst4q_s64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
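The q-form vst4 tests above all share one shape: before optimization, the aggregate argument is copied with memcpy, each element vector is loaded and (for non-byte element types) round-tripped through <16 x i8> bitcasts, and the four vectors feed @llvm.aarch64.neon.st4. A minimal caller sketch, with hypothetical names (store_xyzw8 and the x/y/z/w channels) not taken from the test:

// Illustrative sketch, not part of the checked test.
void store_xyzw8(uint16_t *dst, uint16x8_t x, uint16x8_t y,
                 uint16x8_t z, uint16x8_t w) {
  uint16x8x4_t v = { { x, y, z, w } };  // four 8-lane channels
  vst4q_u16(dst, v);  // interleaved: x0,y0,z0,w0,x1,y1,z1,w1,...
}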
+// CHECK-LABEL: define void @test_vst4q_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_f16
   vst4q_f16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK: call void @llvm.aarch64.neon.st4.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_f32
   vst4q_f32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
+// CHECK: call void @llvm.aarch64.neon.st4.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_f64
   vst4q_f64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
 void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_p8
   vst4q_p8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_p16
   vst4q_p16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
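The tests that follow cover the 64-bit d-form variants: the same lowering shape, but the struct copy shrinks to 32 bytes with 8-byte alignment and the pieces are <8 x i8>-sized. A caller sketch, with a hypothetical name (store4_bytes) not taken from the test:

// Illustrative sketch, not part of the checked test.
void store4_bytes(uint8_t *dst, uint8x8x4_t four) {
  vst4_u8(dst, four);  // @llvm.aarch64.neon.st4.v8i8; 32 bytes written
}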
+// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
 void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_u8
   vst4_u8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_u16
   vst4_u16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_u32
   vst4_u32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_u64
   vst4_u64(a, b);
-  // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
 void test_vst4_s8(int8_t *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_s8
   vst4_s8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_s16(int16_t *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_s16
   vst4_s16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_s32(int32_t *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_s32
   vst4_s32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_s64(int64_t *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_s64
   vst4_s64(a, b);
-  // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
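In the float16 tests below, the checks show that <4 x half> values are bitcast through <8 x i8> to <4 x i16> and stored with st4.v4i16, so the half case rides on the i16 intrinsic. A caller sketch with a hypothetical name (store4_half) not taken from the test:

// Illustrative sketch, not part of the checked test.
void store4_half(float16_t *dst, float16x4x4_t four) {
  vst4_f16(dst, four);  // checked to lower via <4 x i16> bitcasts
}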
%struct.float16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_f16(float16_t *a, float16x4x4_t b) { - // CHECK-LABEL: test_vst4_f16 vst4_f16(a, b); - // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: 
+// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x float>] %b.coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK: call void @llvm.aarch64.neon.st4.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_f32(float32_t *a, float32x2x4_t b) {
- // CHECK-LABEL: test_vst4_f32
  vst4_f32(a, b);
- // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <1 x double>] %b.coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
+// CHECK: call void @llvm.aarch64.neon.st4.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_f64(float64_t *a, float64x1x4_t b) {
- // CHECK-LABEL: test_vst4_f64
  vst4_f64(a, b);
- // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
- // CHECK-LABEL: test_vst4_p8
  vst4_p8(a, b);
- // CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}
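+// NOTE: the alloca/memcpy traffic checked in each vst4 function above is ABI
+// plumbing, not part of the store itself: the NEON struct argument (e.g.
+// poly8x8x4_t) arrives coerced to an array such as [4 x <8 x i8>] %b.coerce,
+// is spilled into a local, copied into the intrinsic's implicit __s1
+// temporary, and only then are the four element vectors loaded and passed to
+// @llvm.aarch64.neon.st4.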
+// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
- // CHECK-LABEL: test_vst4_p16
  vst4_p16(a, b);
- // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld1q_u8_x2(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x2_t [[TMP4]]
uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
- // CHECK-LABEL: test_vld1q_u8_x2
  return vld1q_u8_x2(a);
- // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld1q_u16_x2(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
- // CHECK-LABEL: test_vld1q_u16_x2
  return vld1q_u16_x2(a);
- // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld1q_u32_x2(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
- // CHECK-LABEL: test_vld1q_u32_x2
  return vld1q_u32_x2(a);
- // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld1q_u64_x2(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
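+// NOTE: unlike vld4, the vld1*_x2 intrinsics exercised here perform a
+// contiguous, non-interleaving load of two whole registers (a two-register
+// ld1). A minimal usage sketch, assuming a 32-byte source buffer
+// (illustrative helper, not part of this test):
+//
+//   uint8x16x2_t load_pair_u8(const uint8_t buf[32]) {
+//     uint8x16x2_t r = vld1q_u8_x2(buf);  // r.val[0] = buf[0..15],
+//                                         // r.val[1] = buf[16..31]
+//     return r;
+//   }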
uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) { - // CHECK-LABEL: test_vld1q_u64_x2 return vld1q_u64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP4]] int8x16x2_t test_vld1q_s8_x2(int8_t const *a) { - // CHECK-LABEL: test_vld1q_s8_x2 return vld1q_s8_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP6]] int16x8x2_t test_vld1q_s16_x2(int16_t const *a) { - // CHECK-LABEL: test_vld1q_s16_x2 return vld1q_s16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP6]] 
int32x4x2_t test_vld1q_s32_x2(int32_t const *a) { - // CHECK-LABEL: test_vld1q_s32_x2 return vld1q_s32_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x2_t [[TMP6]] int64x2x2_t test_vld1q_s64_x2(int64_t const *a) { - // CHECK-LABEL: test_vld1q_s64_x2 return vld1q_s64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x2_t @test_vld1q_f16_x2(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x2_t [[TMP6]] float16x8x2_t test_vld1q_f16_x2(float16_t const *a) { - // CHECK-LABEL: test_vld1q_f16_x2 return vld1q_f16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* 
[[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP6]] float32x4x2_t test_vld1q_f32_x2(float32_t const *a) { - // CHECK-LABEL: test_vld1q_f32_x2 return vld1q_f32_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x2_t [[TMP6]] float64x2x2_t test_vld1q_f64_x2(float64_t const *a) { - // CHECK-LABEL: test_vld1q_f64_x2 return vld1q_f64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld1q_p8_x2(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP4]] poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) { - // CHECK-LABEL: test_vld1q_p8_x2 return vld1q_p8_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld1q_p16_x2(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* 
[[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP6]] poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) { - // CHECK-LABEL: test_vld1q_p16_x2 return vld1q_p16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld1q_p64_x2(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x2_t [[TMP6]] poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) { - // CHECK-LABEL: test_vld1q_p64_x2 return vld1q_p64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld1_u8_x2(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP4]] uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) { - // CHECK-LABEL: test_vld1_u8_x2 return vld1_u8_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld1_u16_x2(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* 
[[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP6]] uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) { - // CHECK-LABEL: test_vld1_u16_x2 return vld1_u16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld1_u32_x2(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP6]] uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) { - // CHECK-LABEL: test_vld1_u32_x2 return vld1_u32_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld1_u64_x2(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x2_t [[TMP6]] uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) { - // CHECK-LABEL: test_vld1_u64_x2 return vld1_u64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* 
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP4]] int8x8x2_t test_vld1_s8_x2(int8_t const *a) { - // CHECK-LABEL: test_vld1_s8_x2 return vld1_s8_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP6]] int16x4x2_t test_vld1_s16_x2(int16_t const *a) { - // CHECK-LABEL: test_vld1_s16_x2 return vld1_s16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP6]] int32x2x2_t test_vld1_s32_x2(int32_t const *a) { - // CHECK-LABEL: test_vld1_s32_x2 return vld1_s32_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to 
i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x2_t [[TMP6]] int64x1x2_t test_vld1_s64_x2(int64_t const *a) { - // CHECK-LABEL: test_vld1_s64_x2 return vld1_s64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x2_t @test_vld1_f16_x2(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x2_t [[TMP6]] float16x4x2_t test_vld1_f16_x2(float16_t const *a) { - // CHECK-LABEL: test_vld1_f16_x2 return vld1_f16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP6]] float32x2x2_t test_vld1_f32_x2(float32_t const *a) { - // CHECK-LABEL: test_vld1_f32_x2 return vld1_f32_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { 
<1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x2_t [[TMP6]] float64x1x2_t test_vld1_f64_x2(float64_t const *a) { - // CHECK-LABEL: test_vld1_f64_x2 return vld1_f64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld1_p8_x2(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP4]] poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) { - // CHECK-LABEL: test_vld1_p8_x2 return vld1_p8_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld1_p16_x2(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP6]] poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) { - // CHECK-LABEL: test_vld1_p16_x2 return vld1_p16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld1_p64_x2(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] 
to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x2_t [[TMP6]] poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) { - // CHECK-LABEL: test_vld1_p64_x2 return vld1_p64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld1q_u8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x3_t [[TMP4]] uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) { - // CHECK-LABEL: test_vld1q_u8_x3 return vld1q_u8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld1q_u16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x3_t [[TMP6]] uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) { - // CHECK-LABEL: test_vld1q_u16_x3 return vld1q_u16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld1q_u32_x3(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: 
[[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x3_t [[TMP6]] uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) { - // CHECK-LABEL: test_vld1q_u32_x3 return vld1q_u32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld1q_u64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x3_t [[TMP6]] uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) { - // CHECK-LABEL: test_vld1q_u64_x3 return vld1q_u64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x3_t [[TMP4]] int8x16x3_t test_vld1q_s8_x3(int8_t const *a) { - // CHECK-LABEL: test_vld1q_s8_x3 return vld1q_s8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 +// 
CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x3_t [[TMP6]] int16x8x3_t test_vld1q_s16_x3(int16_t const *a) { - // CHECK-LABEL: test_vld1q_s16_x3 return vld1q_s16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x3_t [[TMP6]] int32x4x3_t test_vld1q_s32_x3(int32_t const *a) { - // CHECK-LABEL: test_vld1q_s32_x3 return vld1q_s32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x3_t [[TMP6]] int64x2x3_t test_vld1q_s64_x3(int64_t const *a) { - // CHECK-LABEL: test_vld1q_s64_x3 return vld1q_s64_x3(a); - // CHECK: 
ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x3_t @test_vld1q_f16_x3(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x3_t [[TMP6]] float16x8x3_t test_vld1q_f16_x3(float16_t const *a) { - // CHECK-LABEL: test_vld1q_f16_x3 return vld1q_f16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x3_t [[TMP6]] float32x4x3_t test_vld1q_f32_x3(float32_t const *a) { - // CHECK-LABEL: test_vld1q_f32_x3 return vld1q_f32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8* +// CHECK: 
[[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x3_t [[TMP6]] float64x2x3_t test_vld1q_f64_x3(float64_t const *a) { - // CHECK-LABEL: test_vld1q_f64_x3 return vld1q_f64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld1q_p8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x3_t [[TMP4]] poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) { - // CHECK-LABEL: test_vld1q_p8_x3 return vld1q_p8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld1q_p16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x3_t [[TMP6]] poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) { - // CHECK-LABEL: test_vld1q_p16_x3 return vld1q_p16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld1q_p64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x 
i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x3_t [[TMP6]] poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) { - // CHECK-LABEL: test_vld1q_p64_x3 return vld1q_p64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld1_u8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP4]] uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) { - // CHECK-LABEL: test_vld1_u8_x3 return vld1_u8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld1_u16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP6]] uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) { - // CHECK-LABEL: test_vld1_u16_x3 return vld1_u16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld1_u32_x3(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x 
i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP6]] uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) { - // CHECK-LABEL: test_vld1_u32_x3 return vld1_u32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld1_u64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x3_t [[TMP6]] uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) { - // CHECK-LABEL: test_vld1_u64_x3 return vld1_u64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x3_t [[TMP4]] int8x8x3_t test_vld1_s8_x3(int8_t const *a) { - // CHECK-LABEL: test_vld1_s8_x3 return vld1_s8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = 
bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x3_t [[TMP6]] int16x4x3_t test_vld1_s16_x3(int16_t const *a) { - // CHECK-LABEL: test_vld1_s16_x3 return vld1_s16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x3_t [[TMP6]] int32x2x3_t test_vld1_s32_x3(int32_t const *a) { - // CHECK-LABEL: test_vld1_s32_x3 return vld1_s32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x3_t [[TMP6]] int64x1x3_t test_vld1_s64_x3(int64_t const *a) { - // CHECK-LABEL: test_vld1_s64_x3 return vld1_s64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x3_t 
@test_vld1_f16_x3(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x3_t [[TMP6]] float16x4x3_t test_vld1_f16_x3(float16_t const *a) { - // CHECK-LABEL: test_vld1_f16_x3 return vld1_f16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP6]] float32x2x3_t test_vld1_f32_x3(float32_t const *a) { - // CHECK-LABEL: test_vld1_f32_x3 return vld1_f32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 
24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x3_t [[TMP6]] float64x1x3_t test_vld1_f64_x3(float64_t const *a) { - // CHECK-LABEL: test_vld1_f64_x3 return vld1_f64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld1_p8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x3_t [[TMP4]] poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) { - // CHECK-LABEL: test_vld1_p8_x3 return vld1_p8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld1_p16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP6]] poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) { - // CHECK-LABEL: test_vld1_p16_x3 return vld1_p16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld1_p64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8* +// 
CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x3_t [[TMP6]] poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) { - // CHECK-LABEL: test_vld1_p64_x3 return vld1_p64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld1q_u8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x4_t [[TMP4]] uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) { - // CHECK-LABEL: test_vld1q_u8_x4 return vld1q_u8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld1q_u16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x4_t [[TMP6]] uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) { - // CHECK-LABEL: test_vld1q_u16_x4 return vld1q_u16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld1q_u32_x4(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } 
@llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x4_t [[TMP6]] uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) { - // CHECK-LABEL: test_vld1q_u32_x4 return vld1q_u32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld1q_u64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x4_t [[TMP6]] uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) { - // CHECK-LABEL: test_vld1q_u64_x4 return vld1q_u64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x4_t [[TMP4]] int8x16x4_t test_vld1q_s8_x4(int8_t const *a) { - // CHECK-LABEL: test_vld1q_s8_x4 return vld1q_s8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca 
%struct.int16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x4_t [[TMP6]] int16x8x4_t test_vld1q_s16_x4(int16_t const *a) { - // CHECK-LABEL: test_vld1q_s16_x4 return vld1q_s16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x4_t [[TMP6]] int32x4x4_t test_vld1q_s32_x4(int32_t const *a) { - // CHECK-LABEL: test_vld1q_s32_x4 return vld1q_s32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// 
CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x4_t [[TMP6]] int64x2x4_t test_vld1q_s64_x4(int64_t const *a) { - // CHECK-LABEL: test_vld1q_s64_x4 return vld1q_s64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x4_t @test_vld1q_f16_x4(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x4_t [[TMP6]] float16x8x4_t test_vld1q_f16_x4(float16_t const *a) { - // CHECK-LABEL: test_vld1q_f16_x4 return vld1q_f16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x4_t [[TMP6]] float32x4x4_t test_vld1q_f32_x4(float32_t const *a) { - // CHECK-LABEL: test_vld1q_f32_x4 return vld1q_f32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 
x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x4_t [[TMP6]] float64x2x4_t test_vld1q_f64_x4(float64_t const *a) { - // CHECK-LABEL: test_vld1q_f64_x4 return vld1q_f64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld1q_p8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x4_t [[TMP4]] poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) { - // CHECK-LABEL: test_vld1q_p8_x4 return vld1q_p8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld1q_p16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x4_t [[TMP6]] poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) { - // CHECK-LABEL: test_vld1q_p16_x4 return vld1q_p16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define 
%struct.poly64x2x4_t @test_vld1q_p64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x4_t [[TMP6]] poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) { - // CHECK-LABEL: test_vld1q_p64_x4 return vld1q_p64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld1_u8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x4_t [[TMP4]] uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) { - // CHECK-LABEL: test_vld1_u8_x4 return vld1_u8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld1_u16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load 
%struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x4_t [[TMP6]] uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) { - // CHECK-LABEL: test_vld1_u16_x4 return vld1_u16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld1_u32_x4(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x4_t [[TMP6]] uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) { - // CHECK-LABEL: test_vld1_u32_x4 return vld1_u32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld1_u64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x4_t [[TMP6]] uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) { - // CHECK-LABEL: test_vld1_u64_x4 return vld1_u64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x 
i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x4_t [[TMP4]] int8x8x4_t test_vld1_s8_x4(int8_t const *a) { - // CHECK-LABEL: test_vld1_s8_x4 return vld1_s8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x4_t [[TMP6]] int16x4x4_t test_vld1_s16_x4(int16_t const *a) { - // CHECK-LABEL: test_vld1_s16_x4 return vld1_s16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x4_t [[TMP6]] int32x2x4_t test_vld1_s32_x4(int32_t const *a) { - // CHECK-LABEL: test_vld1_s32_x4 return vld1_s32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: 
[[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x4_t [[TMP6]] int64x1x4_t test_vld1_s64_x4(int64_t const *a) { - // CHECK-LABEL: test_vld1_s64_x4 return vld1_s64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x4_t @test_vld1_f16_x4(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x4_t [[TMP6]] float16x4x4_t test_vld1_f16_x4(float16_t const *a) { - // CHECK-LABEL: test_vld1_f16_x4 return vld1_f16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret 
%struct.float32x2x4_t [[TMP6]] float32x2x4_t test_vld1_f32_x4(float32_t const *a) { - // CHECK-LABEL: test_vld1_f32_x4 return vld1_f32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x4_t [[TMP6]] float64x1x4_t test_vld1_f64_x4(float64_t const *a) { - // CHECK-LABEL: test_vld1_f64_x4 return vld1_f64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld1_p8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x4_t [[TMP4]] poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) { - // CHECK-LABEL: test_vld1_p8_x4 return vld1_p8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld1_p16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x 
i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x4_t [[TMP6]] poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) { - // CHECK-LABEL: test_vld1_p16_x4 return vld1_p16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld1_p64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x4_t [[TMP6]] poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) { - // CHECK-LABEL: test_vld1_p64_x4 return vld1_p64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) { - // CHECK-LABEL: test_vst1q_u8_x2 vst1q_u8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void 
@test_vst1q_u16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) { - // CHECK-LABEL: test_vst1q_u16_x2 vst1q_u16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u32_x2(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] %b.coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] 
to i32* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]]) +// CHECK: ret void void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) { - // CHECK-LABEL: test_vst1q_u32_x2 vst1q_u32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) { - // CHECK-LABEL: test_vst1q_u64_x2 vst1q_u64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void 
@llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) { - // CHECK-LABEL: test_vst1q_s8_x2 vst1q_s8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) { - // CHECK-LABEL: test_vst1q_s16_x2 vst1q_s16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x 
i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]]) +// CHECK: ret void void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) { - // CHECK-LABEL: test_vst1q_s32_x2 vst1q_s32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) { - // CHECK-LABEL: test_vst1q_s64_x2 vst1q_s64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f16_x2(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* 
[[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) { - // CHECK-LABEL: test_vst1q_f16_x2 vst1q_f16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], float* [[TMP9]]) +// CHECK: ret void void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) { - // CHECK-LABEL: test_vst1q_f32_x2 vst1q_f32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to 
i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to double* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], double* [[TMP9]]) +// CHECK: ret void void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) { - // CHECK-LABEL: test_vst1q_f64_x2 vst1q_f64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) { - // CHECK-LABEL: test_vst1q_p8_x2 vst1q_p8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast 
%struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) { - // CHECK-LABEL: test_vst1q_p16_x2 vst1q_p16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) { - // CHECK-LABEL: test_vst1q_p64_x2 vst1q_p64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca 
%struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) { - // CHECK-LABEL: test_vst1_u8_x2 vst1_u8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) { - // CHECK-LABEL: test_vst1_u16_x2 vst1_u16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u32_x2(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr 
inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]]) +// CHECK: ret void void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) { - // CHECK-LABEL: test_vst1_u32_x2 vst1_u32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) { - // CHECK-LABEL: test_vst1_u64_x2 vst1_u64_x2(a, b); - // 
CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) { - // CHECK-LABEL: test_vst1_s8_x2 vst1_s8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) { - // CHECK-LABEL: test_vst1_s16_x2 vst1_s16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void 
@test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]]) +// CHECK: ret void void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) { - // CHECK-LABEL: test_vst1_s32_x2 vst1_s32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void 
@llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) { - // CHECK-LABEL: test_vst1_s64_x2 vst1_s64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f16_x2(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) { - // CHECK-LABEL: test_vst1_f16_x2 vst1_f16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x 
float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], float* [[TMP9]]) +// CHECK: ret void void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) { - // CHECK-LABEL: test_vst1_f32_x2 vst1_f32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to double* +// CHECK: call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], double* [[TMP9]]) +// CHECK: ret void void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) { - // CHECK-LABEL: test_vst1_f64_x2 vst1_f64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* 
[[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) { - // CHECK-LABEL: test_vst1_p8_x2 vst1_p8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) { - // CHECK-LABEL: test_vst1_p16_x2 vst1_p16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 
0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) { - // CHECK-LABEL: test_vst1_p64_x2 vst1_p64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) { - // CHECK-LABEL: test_vst1q_u8_x3 vst1q_u8_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]],
i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]]) +// CHECK: ret void void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) { - // CHECK-LABEL: test_vst1q_u16_x3 vst1q_u16_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u32_x3(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK:
[[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]]) +// CHECK: ret void void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) { - // CHECK-LABEL: test_vst1q_u32_x3 vst1q_u32_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]]) +// CHECK: ret void void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) { - // CHECK-LABEL: test_vst1q_u64_x3 vst1q_u64_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast 
%struct.int8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) { - // CHECK-LABEL: test_vst1q_s8_x3 vst1q_s8_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void 
@llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i16* [[TMP1]]2) +// CHECK: ret void void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) { - // CHECK-LABEL: test_vst1q_s16_x3 vst1q_s16_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i32* [[TMP1]]2) +// CHECK: ret void void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) { - // CHECK-LABEL: test_vst1q_s32_x3 vst1q_s32_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, 
i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP1]]0, <2 x i64> [[TMP1]]1, i64* [[TMP1]]2) +// CHECK: ret void void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) { - // CHECK-LABEL: test_vst1q_s64_x3 vst1q_s64_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f16_x3(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x 
i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i16* [[TMP1]]2) +// CHECK: ret void void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) { - // CHECK-LABEL: test_vst1q_f16_x3 vst1q_f16_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float* +// CHECK: call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> [[TMP9]], <4 x float> [[TMP1]]0, <4 x float> [[TMP1]]1, float* [[TMP1]]2) +// CHECK: ret void void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) { - // CHECK-LABEL: test_vst1q_f32_x3 vst1q_f32_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast 
%struct.float64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to double* +// CHECK: call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> [[TMP9]], <2 x double> [[TMP1]]0, <2 x double> [[TMP1]]1, double* [[TMP1]]2) +// CHECK: ret void void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) { - // CHECK-LABEL: test_vst1q_f64_x3 vst1q_f64_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// 
CHECK: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) { - // CHECK-LABEL: test_vst1q_p8_x3 vst1q_p8_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i16* [[TMP1]]2) +// CHECK: ret void void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) { - // CHECK-LABEL: test_vst1q_p16_x3 vst1q_p16_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, 
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK: ret void
void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
- // CHECK-LABEL: test_vst1q_p64_x3
  vst1q_p64_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_u8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
- // CHECK-LABEL: test_vst1_u8_x3
  vst1_u8_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_u16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK: ret void
void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
- // CHECK-LABEL: test_vst1_u16_x3
  vst1_u16_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_u32_x3(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK: ret void
void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
- // CHECK-LABEL: test_vst1_u32_x3
  vst1_u32_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_u64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK: ret void
void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
- // CHECK-LABEL: test_vst1_u64_x3
  vst1_u64_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
- // CHECK-LABEL: test_vst1_s8_x3
  vst1_s8_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK: ret void
void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
- // CHECK-LABEL: test_vst1_s16_x3
  vst1_s16_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK: ret void
void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
- // CHECK-LABEL: test_vst1_s32_x3
  vst1_s32_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK: ret void
void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
- // CHECK-LABEL: test_vst1_s64_x3
  vst1_s64_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_f16_x3(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK: ret void
void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
- // CHECK-LABEL: test_vst1_f16_x3
  vst1_f16_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], float* [[TMP12]])
+// CHECK: ret void
void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
- // CHECK-LABEL: test_vst1_f32_x3
  vst1_f32_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], double* [[TMP12]])
+// CHECK: ret void
void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) {
- // CHECK-LABEL: test_vst1_f64_x3
  vst1_f64_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_p8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
- // CHECK-LABEL: test_vst1_p8_x3
  vst1_p8_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_p16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK: ret void
void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
- // CHECK-LABEL: test_vst1_p16_x3
  vst1_p16_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_p64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK: ret void
void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
- // CHECK-LABEL: test_vst1_p64_x3
  vst1_p64_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_u8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) {
- // CHECK-LABEL: test_vst1q_u8_x4
  vst1q_u8_x4(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_u16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK: ret void
void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) {
- // CHECK-LABEL: test_vst1q_u16_x4
  vst1q_u16_x4(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_u32_x4(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK: ret void
void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) {
- // CHECK-LABEL: test_vst1q_u32_x4
  vst1q_u32_x4(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_u64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK: ret void
void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) {
- // CHECK-LABEL: test_vst1q_u64_x4
  vst1q_u64_x4(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
- // CHECK-LABEL: test_vst1q_s8_x4
  vst1q_s8_x4(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16*
[[TMP1]]5) +// CHECK: ret void void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) { - // CHECK-LABEL: test_vst1q_s16_x4 vst1q_s16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x i32> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, i32* [[TMP1]]5) +// CHECK: ret void void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) { - // CHECK-LABEL: test_vst1q_s32_x4 vst1q_s32_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: 
[[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x i64> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, <2 x i64> [[TMP1]]3, <2 x i64> [[TMP1]]4, i64* [[TMP1]]5) +// CHECK: ret void void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) { - // CHECK-LABEL: test_vst1q_s64_x4 vst1q_s64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f16_x4(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> 
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) { - // CHECK-LABEL: test_vst1q_f16_x4 vst1q_f16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]3, 
i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x float> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> [[TMP1]]1, <4 x float> [[TMP1]]2, <4 x float> [[TMP1]]3, <4 x float> [[TMP1]]4, float* [[TMP1]]5) +// CHECK: ret void void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) { - // CHECK-LABEL: test_vst1q_f32_x4 vst1q_f32_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8> 
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x double> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to double* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> [[TMP1]]1, <2 x double> [[TMP1]]2, <2 x double> [[TMP1]]3, <2 x double> [[TMP1]]4, double* [[TMP1]]5) +// CHECK: ret void void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) { - // CHECK-LABEL: test_vst1q_f64_x4 vst1q_f64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]6, align 16 +// CHECK: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) { - // CHECK-LABEL: test_vst1q_p8_x4 vst1q_p8_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8* +// 
CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) { - // CHECK-LABEL: test_vst1q_p16_x4 vst1q_p16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x i64> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, <2 x i64> [[TMP1]]3, <2 x i64> [[TMP1]]4, i64* [[TMP1]]5) +// CHECK: ret void void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) { - // CHECK-LABEL: test_vst1q_p64_x4 vst1q_p64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: call void 
@llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) { - // CHECK-LABEL: test_vst1_u8_x4 vst1_u8_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) { - // CHECK-LABEL: test_vst1_u16_x4 vst1_u16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u32_x4(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: 
store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP1]]1, <2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, i32* [[TMP1]]5) +// CHECK: ret void void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) { - // CHECK-LABEL: test_vst1_u32_x4 vst1_u32_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// 
CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i64* [[TMP1]]5) +// CHECK: ret void void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) { - // CHECK-LABEL: test_vst1_u64_x4 vst1_u64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr 
inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) { - // CHECK-LABEL: test_vst1_s8_x4 vst1_s8_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) { - // CHECK-LABEL: test_vst1_s16_x4 vst1_s16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, 
align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP1]]1, <2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, i32* [[TMP1]]5) +// CHECK: ret void void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) { - // CHECK-LABEL: test_vst1_s32_x4 vst1_s32_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x 
<1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i64* [[TMP1]]5) +// CHECK: ret void void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) { - // CHECK-LABEL: test_vst1_s64_x4 vst1_s64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f16_x4(half* %a, [4 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 
0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) { - // CHECK-LABEL: test_vst1_f16_x4 vst1_f16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x 
float> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <2 x float> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> [[TMP1]]1, <2 x float> [[TMP1]]2, <2 x float> [[TMP1]]3, <2 x float> [[TMP1]]4, float* [[TMP1]]5) +// CHECK: ret void void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) { - // CHECK-LABEL: test_vst1_f32_x4 vst1_f32_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x double> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to double* +// CHECK: call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> [[TMP1]]1, <1 x double> [[TMP1]]2, <1 x double> [[TMP1]]3, <1 x double> [[TMP1]]4, double* [[TMP1]]5) +// CHECK: ret void void 
test_vst1_f64_x4(float64_t *a, float64x1x4_t b) { - // CHECK-LABEL: test_vst1_f64_x4 vst1_f64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) { - // CHECK-LABEL: test_vst1_p8_x4 vst1_p8_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* 
[[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) { - // CHECK-LABEL: test_vst1_p16_x4 vst1_p16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = 
getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i64* [[TMP1]]5) +// CHECK: ret void void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) { - // CHECK-LABEL: test_vst1_p64_x4 vst1_p64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define i64 @test_vceqd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] int64_t test_vceqd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vceqd_s64 -// CHECK: {{cmeq d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vceqd_s64(a, b); } +// CHECK-LABEL: define i64 @test_vceqd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] uint64_t test_vceqd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vceqd_u64 -// CHECK: {{cmeq d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vceqd_u64(a, b); } +// CHECK-LABEL: define i64 @test_vceqzd_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0 +// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQZ_I]] int64_t test_vceqzd_s64(int64_t a) { -// CHECK-LABEL: test_vceqzd_s64 -// CHECK: {{cmeq d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}} return (int64_t)vceqzd_s64(a); } +// CHECK-LABEL: define i64 @test_vceqzd_u64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0 +// CHECK: [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQZD_I]] int64_t test_vceqzd_u64(int64_t a) { -// CHECK-LABEL: test_vceqzd_u64 -// CHECK: {{cmeq d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}} return (int64_t)vceqzd_u64(a); } +// CHECK-LABEL: define i64 @test_vcged_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp sge i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] int64_t test_vcged_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vcged_s64 -// CHECK: {{cmge d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vcged_s64(a, b); } +// CHECK-LABEL: define i64 @test_vcged_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp uge i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] uint64_t test_vcged_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vcged_u64 -// CHECK: {{cmhs d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (uint64_t)vcged_u64(a, b); } +// CHECK-LABEL: define i64 @test_vcgezd_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sge i64 %a, 0 +// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCGEZ_I]] int64_t test_vcgezd_s64(int64_t a) { -// CHECK-LABEL: test_vcgezd_s64 -// CHECK: {{cmge d[0-9]+, d[0-9]+, #0x0|eor x0, x[0-9]+, x0, asr #63}} return 
(int64_t)vcgezd_s64(a); } +// CHECK-LABEL: define i64 @test_vcgtd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] int64_t test_vcgtd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vcgtd_s64 -// CHECK: {{cmgt d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vcgtd_s64(a, b); } +// CHECK-LABEL: define i64 @test_vcgtd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp ugt i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] uint64_t test_vcgtd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vcgtd_u64 -// CHECK: {{cmhi d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (uint64_t)vcgtd_u64(a, b); } +// CHECK-LABEL: define i64 @test_vcgtzd_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, 0 +// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCGTZ_I]] int64_t test_vcgtzd_s64(int64_t a) { -// CHECK-LABEL: test_vcgtzd_s64 -// CHECK: {{cmgt d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}} return (int64_t)vcgtzd_s64(a); } +// CHECK-LABEL: define i64 @test_vcled_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] int64_t test_vcled_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vcled_s64 -// CHECK: {{cmge d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vcled_s64(a, b); } +// CHECK-LABEL: define i64 @test_vcled_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp ule i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] uint64_t test_vcled_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vcled_u64 -// CHECK: {{cmhs d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (uint64_t)vcled_u64(a, b); } +// CHECK-LABEL: define i64 @test_vclezd_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, 0 +// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCLEZ_I]] int64_t test_vclezd_s64(int64_t a) { -// CHECK-LABEL: test_vclezd_s64 -// CHECK: {{cmle d[0-9]+, d[0-9]+, #0x0|cmp x0, #1}} return (int64_t)vclezd_s64(a); } +// CHECK-LABEL: define i64 @test_vcltd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] int64_t test_vcltd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vcltd_s64 -// CHECK: {{cmgt d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vcltd_s64(a, b); } +// CHECK-LABEL: define i64 @test_vcltd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp ult i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] uint64_t test_vcltd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vcltd_u64 -// CHECK: {{cmhi d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (uint64_t)vcltd_u64(a, b); } +// CHECK-LABEL: define i64 @test_vcltzd_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, 0 +// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCLTZ_I]] int64_t test_vcltzd_s64(int64_t a) { -// CHECK-LABEL: test_vcltzd_s64 -// CHECK: {{cmlt d[0-9]+, d[0-9]+, #0x0|asr x0, x0, #63}} return (int64_t)vcltzd_s64(a); } +// CHECK-LABEL: define i64 @test_vtstd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = and i64 %a, %b +// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 +// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 +// CHECK: ret i64 [[VTSTD_I]] int64_t test_vtstd_s64(int64_t a, int64_t 
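// Editor's sketch: the scalar compares checked above return an all-ones or
// all-zeros 64-bit mask (the i1 icmp result sign-extended), not a C bool,
// which makes them directly usable for branch-free selection. Helper name
// and usage are illustrative only.
static inline int64_t sketch_branchless_max(int64_t a, int64_t b) {
  uint64_t mask = vcgtd_s64(a, b); // ~0 if a > b, else 0
  return (int64_t)(((uint64_t)a & mask) | ((uint64_t)b & ~mask));
}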
b) { -// CHECK-LABEL: test_vtstd_s64 -// CHECK: {{cmtst d[0-9]+, d[0-9]+, d[0-9]+|tst x1, x0}} return (int64_t)vtstd_s64(a, b); } +// CHECK-LABEL: define i64 @test_vtstd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = and i64 %a, %b +// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 +// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 +// CHECK: ret i64 [[VTSTD_I]] uint64_t test_vtstd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vtstd_u64 -// CHECK: {{cmtst d[0-9]+, d[0-9]+, d[0-9]+|tst x1, x0}} return (uint64_t)vtstd_u64(a, b); } +// CHECK-LABEL: define i64 @test_vabsd_s64(i64 %a) #0 { +// CHECK: [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 %a) #4 +// CHECK: ret i64 [[VABSD_S64_I]] int64_t test_vabsd_s64(int64_t a) { -// CHECK-LABEL: test_vabsd_s64 -// CHECK: abs {{d[0-9]+}}, {{d[0-9]+}} return (int64_t)vabsd_s64(a); } +// CHECK-LABEL: define i8 @test_vqabsb_s8(i8 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqabsb_s8(int8_t a) { -// CHECK-LABEL: test_vqabsb_s8 -// CHECK: sqabs {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} return (int8_t)vqabsb_s8(a); } +// CHECK-LABEL: define i16 @test_vqabsh_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqabsh_s16(int16_t a) { -// CHECK-LABEL: test_vqabsh_s16 -// CHECK: sqabs {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} return (int16_t)vqabsh_s16(a); } +// CHECK-LABEL: define i32 @test_vqabss_s32(i32 %a) #0 { +// CHECK: [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) #4 +// CHECK: ret i32 [[VQABSS_S32_I]] int32_t test_vqabss_s32(int32_t a) { -// CHECK-LABEL: test_vqabss_s32 -// CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}} return (int32_t)vqabss_s32(a); } +// CHECK-LABEL: define i64 @test_vqabsd_s64(i64 %a) #0 { +// CHECK: [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 %a) #4 +// CHECK: ret i64 [[VQABSD_S64_I]] int64_t test_vqabsd_s64(int64_t a) { -// CHECK-LABEL: test_vqabsd_s64 -// CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}} return (int64_t)vqabsd_s64(a); } +// CHECK-LABEL: define i64 @test_vnegd_s64(i64 %a) #0 { +// CHECK: [[VNEGD_I:%.*]] = sub i64 0, %a +// CHECK: ret i64 [[VNEGD_I]] int64_t test_vnegd_s64(int64_t a) { -// CHECK-LABEL: test_vnegd_s64 -// CHECK: neg {{[xd][0-9]+}}, {{[xd][0-9]+}} return (int64_t)vnegd_s64(a); } +// CHECK-LABEL: define i8 @test_vqnegb_s8(i8 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqnegb_s8(int8_t a) { -// CHECK-LABEL: test_vqnegb_s8 -// CHECK: sqneg {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} return (int8_t)vqnegb_s8(a); } +// CHECK-LABEL: define i16 @test_vqnegh_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 
0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqnegh_s16(int16_t a) { -// CHECK-LABEL: test_vqnegh_s16 -// CHECK: sqneg {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} return (int16_t)vqnegh_s16(a); } +// CHECK-LABEL: define i32 @test_vqnegs_s32(i32 %a) #0 { +// CHECK: [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 %a) #4 +// CHECK: ret i32 [[VQNEGS_S32_I]] int32_t test_vqnegs_s32(int32_t a) { -// CHECK-LABEL: test_vqnegs_s32 -// CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}} return (int32_t)vqnegs_s32(a); } +// CHECK-LABEL: define i64 @test_vqnegd_s64(i64 %a) #0 { +// CHECK: [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 %a) #4 +// CHECK: ret i64 [[VQNEGD_S64_I]] int64_t test_vqnegd_s64(int64_t a) { -// CHECK-LABEL: test_vqnegd_s64 -// CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}} return (int64_t)vqnegd_s64(a); } +// CHECK-LABEL: define i8 @test_vuqaddb_s8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] int8_t test_vuqaddb_s8(int8_t a, int8_t b) { -// CHECK-LABEL: test_vuqaddb_s8 -// CHECK: suqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} return (int8_t)vuqaddb_s8(a, b); } +// CHECK-LABEL: define i16 @test_vuqaddh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] int16_t test_vuqaddh_s16(int16_t a, int16_t b) { -// CHECK-LABEL: test_vuqaddh_s16 -// CHECK: suqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} return (int16_t)vuqaddh_s16(a, b); } +// CHECK-LABEL: define i32 @test_vuqadds_s32(i32 %a, i32 %b) #0 { +// CHECK: [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VUQADDS_S32_I]] int32_t test_vuqadds_s32(int32_t a, int32_t b) { -// CHECK-LABEL: test_vuqadds_s32 -// CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}} return (int32_t)vuqadds_s32(a, b); } +// CHECK-LABEL: define i64 @test_vuqaddd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VUQADDD_S64_I]] int64_t test_vuqaddd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vuqaddd_s64 -// CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}} return (int64_t)vuqaddd_s64(a, b); } +// CHECK-LABEL: define i8 @test_vsqaddb_u8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] uint8_t test_vsqaddb_u8(uint8_t a, uint8_t b) { -// CHECK-LABEL: test_vsqaddb_u8 -// CHECK: usqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} return (uint8_t)vsqaddb_u8(a, b); } +// CHECK-LABEL: define i16 @test_vsqaddh_u16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = 
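// Editor's sketch: unlike a plain abs/neg, the sqabs/sqneg forms tested
// here saturate rather than wrap, so the single problematic input (the
// minimum value) maps to the maximum. Illustrative values only.
static inline void sketch_qabs_edge(void) {
  int8_t r = vqabsb_s8((int8_t)-128); // saturates to 127 instead of wrapping
  (void)r;
}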
insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] uint16_t test_vsqaddh_u16(uint16_t a, uint16_t b) { -// CHECK-LABEL: test_vsqaddh_u16 -// CHECK: usqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} return (uint16_t)vsqaddh_u16(a, b); } +// CHECK-LABEL: define i32 @test_vsqadds_u32(i32 %a, i32 %b) #0 { +// CHECK: [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VSQADDS_U32_I]] uint32_t test_vsqadds_u32(uint32_t a, uint32_t b) { -// CHECK-LABEL: test_vsqadds_u32 -// CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}} return (uint32_t)vsqadds_u32(a, b); } +// CHECK-LABEL: define i64 @test_vsqaddd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VSQADDD_U64_I]] uint64_t test_vsqaddd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vsqaddd_u64 -// CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}} return (uint64_t)vsqaddd_u64(a, b); } +// CHECK-LABEL: define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0 +// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 +// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0_I]]) #4 +// CHECK: ret i32 [[VQDMLXL1_I]] int32_t test_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) { -// CHECK-ARM64-LABEL: test_vqdmlalh_s16 -// CHECK-ARM64: sqdmull v[[PROD:[0-9]+]].4s, {{v[0-9]+.4h}}, {{v[0-9]+.4h}} -// CHECK-ARM64: sqadd {{s[0-9]+}}, {{s[0-9]+}}, s[[PROD]] return (int32_t)vqdmlalh_s16(a, b, c); } +// CHECK-LABEL: define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) #0 { +// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4 +// CHECK: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL_I]]) #4 +// CHECK: ret i64 [[VQDMLXL1_I]] int64_t test_vqdmlals_s32(int64_t a, int32_t b, int32_t c) { -// CHECK-LABEL: test_vqdmlals_s32 -// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} return (int64_t)vqdmlals_s32(a, b, c); } +// CHECK-LABEL: define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0 +// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 +// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0_I]]) #4 +// CHECK: ret i32 [[VQDMLXL1_I]] int32_t test_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) { -// CHECK-ARM64-LABEL: test_vqdmlslh_s16 -// CHECK-ARM64: sqdmull v[[PROD:[0-9]+]].4s, {{v[0-9]+.4h}}, {{v[0-9]+.4h}} -// CHECK-ARM64: sqsub {{s[0-9]+}}, {{s[0-9]+}}, s[[PROD]] return (int32_t)vqdmlslh_s16(a, b, c); } +// CHECK-LABEL: define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) #0 { +// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4 +// CHECK: [[VQDMLXL1_I:%.*]] = call i64 
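// Editor's sketch: suqadd/usqadd mix signedness. In vsqaddb_u8 the addend
// is treated as signed while the accumulator and result stay unsigned, so
// the sum clamps at 0 and UINT8_MAX. Illustrative values only.
static inline uint8_t sketch_usqadd_edge(void) {
  return vsqaddb_u8((uint8_t)10, (uint8_t)0xec); // 0xec acts as -20; clamps to 0
}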
@llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL_I]]) #4 +// CHECK: ret i64 [[VQDMLXL1_I]] int64_t test_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) { -// CHECK-LABEL: test_vqdmlsls_s32 -// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} return (int64_t)vqdmlsls_s32(a, b, c); } +// CHECK-LABEL: define i32 @test_vqdmullh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0 +// CHECK: ret i32 [[TMP2]] int32_t test_vqdmullh_s16(int16_t a, int16_t b) { -// CHECK-LABEL: test_vqdmullh_s16 -// CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} return (int32_t)vqdmullh_s16(a, b); } +// CHECK-LABEL: define i64 @test_vqdmulls_s32(i32 %a, i32 %b) #0 { +// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %b) #4 +// CHECK: ret i64 [[VQDMULLS_S32_I]] int64_t test_vqdmulls_s32(int32_t a, int32_t b) { -// CHECK-LABEL: test_vqdmulls_s32 -// CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} return (int64_t)vqdmulls_s32(a, b); } +// CHECK-LABEL: define i8 @test_vqmovunh_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqmovunh_s16(int16_t a) { -// CHECK-LABEL: test_vqmovunh_s16 -// CHECK: sqxtun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}} return (int8_t)vqmovunh_s16(a); } +// CHECK-LABEL: define i16 @test_vqmovuns_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqmovuns_s32(int32_t a) { -// CHECK-LABEL: test_vqmovuns_s32 -// CHECK: sqxtun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}} return (int16_t)vqmovuns_s32(a); } +// CHECK-LABEL: define i32 @test_vqmovund_s64(i64 %a) #0 { +// CHECK: [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %a) #4 +// CHECK: ret i32 [[VQMOVUND_S64_I]] int32_t test_vqmovund_s64(int64_t a) { -// CHECK-LABEL: test_vqmovund_s64 -// CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}} return (int32_t)vqmovund_s64(a); } +// CHECK-LABEL: define i8 @test_vqmovnh_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqmovnh_s16(int16_t a) { -// CHECK-LABEL: test_vqmovnh_s16 -// CHECK: sqxtn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}} return (int8_t)vqmovnh_s16(a); } +// CHECK-LABEL: define i16 @test_vqmovns_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t 
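// Editor's sketch: the vqmov(u)n family narrows with saturation, and the
// "un" variants additionally clamp negative signed inputs to zero on the
// way to an unsigned result. Illustrative values only.
static inline void sketch_qmovn_edge(void) {
  int8_t n = vqmovnh_s16((int16_t)300);  // 300 > 127, saturates to 127
  uint8_t u = vqmovunh_s16((int16_t)-5); // negative clamps to 0
  (void)n; (void)u;
}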
test_vqmovns_s32(int32_t a) { -// CHECK-LABEL: test_vqmovns_s32 -// CHECK: sqxtn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}} return (int16_t)vqmovns_s32(a); } +// CHECK-LABEL: define i32 @test_vqmovnd_s64(i64 %a) #0 { +// CHECK: [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %a) #4 +// CHECK: ret i32 [[VQMOVND_S64_I]] int32_t test_vqmovnd_s64(int64_t a) { -// CHECK-LABEL: test_vqmovnd_s64 -// CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}} return (int32_t)vqmovnd_s64(a); } +// CHECK-LABEL: define i8 @test_vqmovnh_u16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqmovnh_u16(int16_t a) { -// CHECK-LABEL: test_vqmovnh_u16 -// CHECK: uqxtn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}} return (int8_t)vqmovnh_u16(a); } +// CHECK-LABEL: define i16 @test_vqmovns_u32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqmovns_u32(int32_t a) { -// CHECK-LABEL: test_vqmovns_u32 -// CHECK: uqxtn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}} return (int16_t)vqmovns_u32(a); } +// CHECK-LABEL: define i32 @test_vqmovnd_u64(i64 %a) #0 { +// CHECK: [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %a) #4 +// CHECK: ret i32 [[VQMOVND_U64_I]] int32_t test_vqmovnd_u64(int64_t a) { -// CHECK-LABEL: test_vqmovnd_u64 -// CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}} return (int32_t)vqmovnd_u64(a); } +// CHECK-LABEL: define i32 @test_vceqs_f32(float %a, float %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCMPD_I]] uint32_t test_vceqs_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vceqs_f32 -// CHECK: {{fcmeq s0, s0, s1|fcmp s0, s1}} return (uint32_t)vceqs_f32(a, b); } +// CHECK-LABEL: define i64 @test_vceqd_f64(double %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCMPD_I]] uint64_t test_vceqd_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vceqd_f64 -// CHECK: {{fcmeq d0, d0, d1|fcmp d0, d1}} return (uint64_t)vceqd_f64(a, b); } +// CHECK-LABEL: define i32 @test_vceqzs_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00 +// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCEQZ_I]] uint32_t test_vceqzs_f32(float32_t a) { -// CHECK-LABEL: test_vceqzs_f32 -// CHECK: {{fcmeq s0, s0, #0.0|fcmp s0, #0.0}} return (uint32_t)vceqzs_f32(a); } +// CHECK-LABEL: define i64 @test_vceqzd_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00 +// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQZ_I]] uint64_t test_vceqzd_f64(float64_t a) { -// CHECK-LABEL: test_vceqzd_f64 -// CHECK: {{fcmeq d0, d0, #0.0|fcmp d0, #0.0}} return (uint64_t)vceqzd_f64(a); } +// CHECK-LABEL: define i32 @test_vcges_f32(float %a, float %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oge float %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCMPD_I]] uint32_t test_vcges_f32(float32_t a, float32_t b) { -// 
CHECK-LABEL: test_vcges_f32 -// CHECK: {{fcmge s0, s0, s1|fcmp s0, s1}} return (uint32_t)vcges_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcged_f64(double %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oge double %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCMPD_I]] uint64_t test_vcged_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcged_f64 -// CHECK: {{fcmge d0, d0, d1|fcmp d0, d1}} return (uint64_t)vcged_f64(a, b); } +// CHECK-LABEL: define i32 @test_vcgezs_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00 +// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCGEZ_I]] uint32_t test_vcgezs_f32(float32_t a) { -// CHECK-LABEL: test_vcgezs_f32 -// CHECK: {{fcmge s0, s0, #0.0|fcmp s0, #0.0}} return (uint32_t)vcgezs_f32(a); } +// CHECK-LABEL: define i64 @test_vcgezd_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00 +// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCGEZ_I]] uint64_t test_vcgezd_f64(float64_t a) { -// CHECK-LABEL: test_vcgezd_f64 -// CHECK: {{fcmge d0, d0, #0.0|fcmp d0, #0.0}} return (uint64_t)vcgezd_f64(a); } +// CHECK-LABEL: define i32 @test_vcgts_f32(float %a, float %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ogt float %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCMPD_I]] uint32_t test_vcgts_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vcgts_f32 -// CHECK: {{fcmgt s0, s0, s1|fcmp s0, s1}} return (uint32_t)vcgts_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcgtd_f64(double %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ogt double %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCMPD_I]] uint64_t test_vcgtd_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcgtd_f64 -// CHECK: {{fcmgt d0, d0, d1|fcmp d0, d1}} return (uint64_t)vcgtd_f64(a, b); } +// CHECK-LABEL: define i32 @test_vcgtzs_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00 +// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCGTZ_I]] uint32_t test_vcgtzs_f32(float32_t a) { -// CHECK-LABEL: test_vcgtzs_f32 -// CHECK: {{fcmgt s0, s0, #0.0|fcmp s0, #0.0}} return (uint32_t)vcgtzs_f32(a); } +// CHECK-LABEL: define i64 @test_vcgtzd_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00 +// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCGTZ_I]] uint64_t test_vcgtzd_f64(float64_t a) { -// CHECK-LABEL: test_vcgtzd_f64 -// CHECK: {{fcmgt d0, d0, #0.0|fcmp d0, #0.0}} return (uint64_t)vcgtzd_f64(a); } +// CHECK-LABEL: define i32 @test_vcles_f32(float %a, float %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ole float %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCMPD_I]] uint32_t test_vcles_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vcles_f32 -// CHECK: {{fcmge s0, s1, s0|fcmp s0, s1}} return (uint32_t)vcles_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcled_f64(double %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ole double %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCMPD_I]] uint64_t test_vcled_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcled_f64 -// CHECK: {{fcmge d0, d1, d0|fcmp d0, d1}} return (uint64_t)vcled_f64(a, b); } +// CHECK-LABEL: define i32 @test_vclezs_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00 +// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// 
CHECK: ret i32 [[VCLEZ_I]] uint32_t test_vclezs_f32(float32_t a) { -// CHECK-LABEL: test_vclezs_f32 -// CHECK: {{fcmle s0, s0, #0.0|fcmp s0, #0.0}} return (uint32_t)vclezs_f32(a); } +// CHECK-LABEL: define i64 @test_vclezd_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00 +// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCLEZ_I]] uint64_t test_vclezd_f64(float64_t a) { -// CHECK-LABEL: test_vclezd_f64 -// CHECK: {{fcmle d0, d0, #0.0|fcmp d0, #0.0}} return (uint64_t)vclezd_f64(a); } +// CHECK-LABEL: define i32 @test_vclts_f32(float %a, float %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp olt float %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCMPD_I]] uint32_t test_vclts_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vclts_f32 -// CHECK: {{fcmgt s0, s1, s0|fcmp s0, s1}} return (uint32_t)vclts_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcltd_f64(double %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp olt double %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCMPD_I]] uint64_t test_vcltd_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcltd_f64 -// CHECK: {{fcmgt d0, d1, d0|fcmp d0, d1}} return (uint64_t)vcltd_f64(a, b); } +// CHECK-LABEL: define i32 @test_vcltzs_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00 +// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCLTZ_I]] uint32_t test_vcltzs_f32(float32_t a) { -// CHECK-LABEL: test_vcltzs_f32 -// CHECK: {{fcmlt s0, s0, #0.0|fcmp s0, #0.0}} return (uint32_t)vcltzs_f32(a); } +// CHECK-LABEL: define i64 @test_vcltzd_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00 +// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCLTZ_I]] uint64_t test_vcltzd_f64(float64_t a) { -// CHECK-LABEL: test_vcltzd_f64 -// CHECK: {{fcmlt d0, d0, #0.0|fcmp d0, #0.0}} return (uint64_t)vcltzd_f64(a); } +// CHECK-LABEL: define i32 @test_vcages_f32(float %a, float %b) #0 { +// CHECK: [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %a, float %b) #4 +// CHECK: ret i32 [[VCAGES_F32_I]] uint32_t test_vcages_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vcages_f32 -// CHECK: facge s0, s0, s1 return (uint32_t)vcages_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcaged_f64(double %a, double %b) #0 { +// CHECK: [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %a, double %b) #4 +// CHECK: ret i64 [[VCAGED_F64_I]] uint64_t test_vcaged_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcaged_f64 -// CHECK: facge d0, d0, d1 return (uint64_t)vcaged_f64(a, b); } +// CHECK-LABEL: define i32 @test_vcagts_f32(float %a, float %b) #0 { +// CHECK: [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %a, float %b) #4 +// CHECK: ret i32 [[VCAGTS_F32_I]] uint32_t test_vcagts_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vcagts_f32 -// CHECK: facgt s0, s0, s1 return (uint32_t)vcagts_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcagtd_f64(double %a, double %b) #0 { +// CHECK: [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %a, double %b) #4 +// CHECK: ret i64 [[VCAGTD_F64_I]] uint64_t test_vcagtd_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcagtd_f64 -// CHECK: facgt d0, d0, d1 return (uint64_t)vcagtd_f64(a, b); } +// CHECK-LABEL: define i32 @test_vcales_f32(float %a, float %b) #0 { +// CHECK: [[VCALES_F32_I:%.*]] = call i32 
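// Editor's sketch: the vcage/vcagt family compares magnitudes, i.e.
// vcages_f32(a, b) tests |a| >= |b|, and the "le"/"lt" forms are the same
// facge/facgt operation with the operands swapped, as the argument order in
// the nearby checks shows. Illustrative only.
static inline uint32_t sketch_mag_ge(float32_t a, float32_t b) {
  return vcages_f32(a, b); // all-ones iff |a| >= |b|, else 0
}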
@llvm.aarch64.neon.facge.i32.f32(float %b, float %a) #4
+// CHECK: ret i32 [[VCALES_F32_I]]
uint32_t test_vcales_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcales_f32
-// CHECK: facge s0, s1, s0
  return (uint32_t)vcales_f32(a, b);
}
+// CHECK-LABEL: define i64 @test_vcaled_f64(double %a, double %b) #0 {
+// CHECK: [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %b, double %a) #4
+// CHECK: ret i64 [[VCALED_F64_I]]
uint64_t test_vcaled_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcaled_f64
-// CHECK: facge d0, d1, d0
  return (uint64_t)vcaled_f64(a, b);
}
+// CHECK-LABEL: define i32 @test_vcalts_f32(float %a, float %b) #0 {
+// CHECK: [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %b, float %a) #4
+// CHECK: ret i32 [[VCALTS_F32_I]]
uint32_t test_vcalts_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcalts_f32
-// CHECK: facgt s0, s1, s0
  return (uint32_t)vcalts_f32(a, b);
}
+// CHECK-LABEL: define i64 @test_vcaltd_f64(double %a, double %b) #0 {
+// CHECK: [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %b, double %a) #4
+// CHECK: ret i64 [[VCALTD_F64_I]]
uint64_t test_vcaltd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcaltd_f64
-// CHECK: facgt d0, d1, d0
  return (uint64_t)vcaltd_f64(a, b);
}
+// CHECK-LABEL: define i64 @test_vshrd_n_s64(i64 %a) #0 {
+// CHECK: [[SHRD_N:%.*]] = ashr i64 %a, 1
+// CHECK: ret i64 [[SHRD_N]]
int64_t test_vshrd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vshrd_n_s64
-// CHECK: {{sshr d[0-9]+, d[0-9]+, #1|asr x0, x0, #1}}
  return (int64_t)vshrd_n_s64(a, 1);
}
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
+// CHECK: ret <1 x i64> [[VSHR_N]]
int64x1_t test_vshr_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vshr_n_s64
-// CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #1
  return vshr_n_s64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vshrd_n_u64(i64 %a) #0 {
+// CHECK: ret i64 0
uint64_t test_vshrd_n_u64(uint64_t a) {
-// CHECK-ARM64-LABEL: test_vshrd_n_u64
-// CHECK-ARM64: mov x0, xzr
  return (uint64_t)vshrd_n_u64(a, 64);
}
+// CHECK-LABEL: define i64 @test_vshrd_n_u64_2() #0 {
+// CHECK: ret i64 0
uint64_t test_vshrd_n_u64_2() {
-// CHECK-ARM64-LABEL: test_vshrd_n_u64_2
-// CHECK-ARM64: mov x0, xzr
  uint64_t a = UINT64_C(0xf000000000000000);
  return vshrd_n_u64(a, 64);
}
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
+// CHECK: ret <1 x i64> [[VSHR_N]]
uint64x1_t test_vshr_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vshr_n_u64
-// CHECK: ushr {{d[0-9]+}}, {{d[0-9]+}}, #1
  return vshr_n_u64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vrshrd_n_s64(i64 %a) #0 {
+// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 -63)
+// CHECK: ret i64 [[VRSHR_N]]
int64_t test_vrshrd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vrshrd_n_s64
-// CHECK: srshr {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (int64_t)vrshrd_n_s64(a, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK: ret <1 x i64> [[VRSHR_N1]]
int64x1_t test_vrshr_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vrshr_n_s64
-// CHECK: srshr d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vrshr_n_s64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vrshrd_n_u64(i64 %a) #0 {
+// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 -63)
+// CHECK: ret i64 [[VRSHR_N]]
uint64_t test_vrshrd_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vrshrd_n_u64
-// CHECK: urshr {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (uint64_t)vrshrd_n_u64(a, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK: ret <1 x i64> [[VRSHR_N1]]
uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vrshr_n_u64
-// CHECK: urshr d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vrshr_n_u64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vsrad_n_s64(i64 %a, i64 %b) #0 {
+// CHECK: [[SHRD_N:%.*]] = ashr i64 %b, 63
+// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
+// CHECK: ret i64 [[TMP0]]
int64_t test_vsrad_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vsrad_n_s64
-// CHECK: {{ssra d[0-9]+, d[0-9]+, #63|add x0, x0, x1, asr #63}}
  return (int64_t)vsrad_n_s64(a, b, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
+// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK: ret <1 x i64> [[TMP4]]
int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vsra_n_s64
-// CHECK: ssra d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vsra_n_s64(a, b, 1);
}
+// CHECK-LABEL: define i64 @test_vsrad_n_u64(i64 %a, i64 %b) #0 {
+// CHECK: [[SHRD_N:%.*]] = lshr i64 %b, 63
+// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
+// CHECK: ret i64 [[TMP0]]
uint64_t test_vsrad_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vsrad_n_u64
-// CHECK: {{usra d[0-9]+, d[0-9]+, #63|add x0, x0, x1, lsr #63}}
  return (uint64_t)vsrad_n_u64(a, b, 63);
}
+// CHECK-LABEL: define i64 @test_vsrad_n_u64_2(i64 %a, i64 %b) #0 {
+// CHECK: ret i64 %a
uint64_t test_vsrad_n_u64_2(uint64_t a, uint64_t b) {
-// CHECK-ARM64-LABEL: test_vsrad_n_u64_2
-// CHECK-ARM64-NOT: add
  return (uint64_t)vsrad_n_u64(a, b, 64);
}
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
+// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK: ret <1 x i64> [[TMP4]]
uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vsra_n_u64
-// CHECK: usra d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vsra_n_u64(a, b, 1);
}
+// CHECK-LABEL: define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) #0 {
+// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %b, i64 -63)
+// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]]
+// CHECK: ret i64 [[TMP1]]
int64_t test_vrsrad_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vrsrad_n_s64
-// CHECK: {{srsra d[0-9]+, d[0-9]+, #63}}
  return (int64_t)vrsrad_n_s64(a, b, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK: ret <1 x i64> [[TMP3]]
int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vrsra_n_s64
-// CHECK: srsra d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vrsra_n_s64(a, b, 1);
}
+// CHECK-LABEL: define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) #0 {
+// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %b, i64 -63)
+// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]]
+// CHECK: ret i64 [[TMP1]]
uint64_t test_vrsrad_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vrsrad_n_u64
-// CHECK: ursra {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (uint64_t)vrsrad_n_u64(a, b, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK: ret <1 x i64> [[TMP3]]
uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vrsra_n_u64
-// CHECK: ursra d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vrsra_n_u64(a, b, 1);
}
+// CHECK-LABEL: define i64 @test_vshld_n_s64(i64 %a) #0 {
+// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 1
+// CHECK: ret i64 [[SHLD_N]]
int64_t test_vshld_n_s64(int64_t a) {
-// CHECK-LABEL: test_vshld_n_s64
-// CHECK: {{shl d[0-9]+, d[0-9]+, #1|lsl x0, x0, #1}}
  return (int64_t)vshld_n_s64(a, 1);
}
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK: ret <1 x i64> [[VSHL_N]]
int64x1_t test_vshl_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vshl_n_s64
-// CHECK: shl d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vshl_n_s64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vshld_n_u64(i64 %a) #0 {
+// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 63
+// CHECK: ret i64 [[SHLD_N]]
uint64_t test_vshld_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vshld_n_u64
-// CHECK: {{shl d[0-9]+, d[0-9]+, #63|lsl x0, x0, #63}}
  return (uint64_t)vshld_n_u64(a, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK: ret <1 x i64> [[VSHL_N]]
uint64x1_t test_vshl_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vshl_n_u64
-// CHECK: shl d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vshl_n_u64(a, 1);
}
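// Editor's sketch: the (v)sra_n family fuses a shift with an accumulate,
// d = a + (b >> n); the rounding variants add 2^(n-1) before shifting,
// which is why vrsra_n lowers to srshl/urshl with a negative shift amount
// in the checks above. Illustrative only.
static inline int64_t sketch_sign_accumulate(int64_t acc, int64_t x) {
  return vsrad_n_s64(acc, x, 63); // acc + (x >> 63): adds -1 for negative x
}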
+// CHECK-LABEL: define i8 @test_vqshlb_n_s8(i8 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK: [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
+// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
+// CHECK: ret i8 [[TMP1]]
int8_t test_vqshlb_n_s8(int8_t a) {
-// CHECK-LABEL: test_vqshlb_n_s8
-// CHECK: sqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
  return (int8_t)vqshlb_n_s8(a, 7);
}
+// CHECK-LABEL: define i16 @test_vqshlh_n_s16(i16 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK: [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 15, i16 15, i16 15>)
+// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
+// CHECK: ret i16 [[TMP1]]
int16_t test_vqshlh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshlh_n_s16
-// CHECK: sqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
  return (int16_t)vqshlh_n_s16(a, 15);
}
+// CHECK-LABEL: define i32 @test_vqshls_n_s32(i32 %a) #0 {
+// CHECK: [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 31)
+// CHECK: ret i32 [[VQSHLS_N_S32]]
int32_t test_vqshls_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshls_n_s32
-// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
  return (int32_t)vqshls_n_s32(a, 31);
}
+// CHECK-LABEL: define i64 @test_vqshld_n_s64(i64 %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 63)
+// CHECK: ret i64 [[VQSHL_N]]
int64_t test_vqshld_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshld_n_s64
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (int64_t)vqshld_n_s64(a, 63);
}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
+// CHECK: ret <8 x i8> [[VQSHL_N]]
int8x8_t test_vqshl_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqshl_n_s8
  return vqshl_n_s8(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
+// CHECK: ret <16 x i8> [[VQSHL_N]]
int8x16_t test_vqshlq_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s8
  return vqshlq_n_s8(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
+// CHECK: ret <4 x i16> [[VQSHL_N1]]
int16x4_t test_vqshl_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqshl_n_s16
  return vqshl_n_s16(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
+// CHECK: ret <8 x i16> [[VQSHL_N1]]
int16x8_t test_vqshlq_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s16
  return vqshlq_n_s16(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
+// CHECK: ret <2 x i32> [[VQSHL_N1]]
int32x2_t test_vqshl_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqshl_n_s32
  return vqshl_n_s32(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
+// CHECK: ret <4 x i32> [[VQSHL_N1]]
int32x4_t test_vqshlq_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s32
  return vqshlq_n_s32(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
+// CHECK: ret <2 x i64> [[VQSHL_N1]]
int64x2_t test_vqshlq_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s64
  return vqshlq_n_s64(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
+// CHECK: ret <8 x i8> [[VQSHL_N]]
uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vqshl_n_u8
  return vqshl_n_u8(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
+// CHECK: ret <16 x i8> [[VQSHL_N]]
uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u8
  return vqshlq_n_u8(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
+// CHECK: ret <4 x i16> [[VQSHL_N1]]
uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vqshl_n_u16
  return vqshl_n_u16(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
+// CHECK: ret <8 x i16> [[VQSHL_N1]]
uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u16
  return vqshlq_n_u16(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
+// CHECK: ret <2 x i32> [[VQSHL_N1]]
uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vqshl_n_u32
  return vqshl_n_u32(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
+// CHECK: ret <4 x i32> [[VQSHL_N1]]
uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u32
  return vqshlq_n_u32(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
+// CHECK: ret <2 x i64> [[VQSHL_N1]]
uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u64
  return vqshlq_n_u64(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK: ret <1 x i64> [[VQSHL_N1]]
int64x1_t test_vqshl_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vqshl_n_s64
-// CHECK: sqshl d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vqshl_n_s64(a, 1);
}
+// CHECK-LABEL: define i8 @test_vqshlb_n_u8(i8 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK: [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
+// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
+// CHECK: ret i8 [[TMP1]]
uint8_t test_vqshlb_n_u8(uint8_t a) {
-// CHECK-LABEL: test_vqshlb_n_u8
-// CHECK: uqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
  return (uint8_t)vqshlb_n_u8(a, 7);
}
+// CHECK-LABEL: define i16 @test_vqshlh_n_u16(i16 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK: [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 15, i16 15, i16 15>)
+// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
+// CHECK: ret i16 [[TMP1]]
uint16_t test_vqshlh_n_u16(uint16_t a) {
-// CHECK-LABEL: test_vqshlh_n_u16
-// CHECK: uqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
  return (uint16_t)vqshlh_n_u16(a, 15);
}
+// CHECK-LABEL: define i32 @test_vqshls_n_u32(i32 %a) #0 {
+// CHECK: [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 31)
+// CHECK: ret i32 [[VQSHLS_N_U32]]
uint32_t test_vqshls_n_u32(uint32_t a) {
-// CHECK-LABEL: test_vqshls_n_u32
-// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
  return (uint32_t)vqshls_n_u32(a, 31);
}
+// CHECK-LABEL: define i64 @test_vqshld_n_u64(i64 %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 63)
+// CHECK: ret i64 [[VQSHL_N]]
uint64_t test_vqshld_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vqshld_n_u64
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (uint64_t)vqshld_n_u64(a, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK: ret <1 x i64> [[VQSHL_N1]]
uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vqshl_n_u64
-// CHECK: uqshl d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vqshl_n_u64(a, 1);
}
+// CHECK-LABEL: define i8 @test_vqshlub_n_s8(i8 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK: [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
+// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
+// CHECK: ret i8 [[TMP1]]
int8_t test_vqshlub_n_s8(int8_t a) {
-// CHECK-LABEL: test_vqshlub_n_s8
-// CHECK: sqshlu {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
  return (int8_t)vqshlub_n_s8(a, 7);
}
+// CHECK-LABEL: define i16 @test_vqshluh_n_s16(i16 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK: [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 15, i16 15, i16 15>)
+// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
+// CHECK: ret i16 [[TMP1]]
int16_t test_vqshluh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshluh_n_s16
-// CHECK: sqshlu {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
  return (int16_t)vqshluh_n_s16(a, 15);
}
+// CHECK-LABEL: define i32 @test_vqshlus_n_s32(i32 %a) #0 {
+// CHECK: [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 %a, i32 31)
+// CHECK: ret i32 [[VQSHLUS_N_S32]]
int32_t test_vqshlus_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshlus_n_s32
-// CHECK: sqshlu {{s[0-9]+}}, {{s[0-9]+}}, #31
  return (int32_t)vqshlus_n_s32(a, 31);
}
+// CHECK-LABEL: define i64 @test_vqshlud_n_s64(i64 %a) #0 {
+// CHECK: [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 %a, i64 63)
+// CHECK: ret i64 [[VQSHLU_N]]
int64_t test_vqshlud_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshlud_n_s64
-// CHECK: sqshlu {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (int64_t)vqshlud_n_s64(a, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
+// CHECK: ret <1 x i64> [[VQSHLU_N1]]
uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vqshlu_n_s64
-// CHECK: sqshlu d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vqshlu_n_s64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vsrid_n_s64(i64 %a, i64 %b) #0 {
+// CHECK: [[VSRID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK: [[VSRID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
+// CHECK: [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
+// CHECK: [[VSRID_N_S643:%.*]] = bitcast <1 x i64> [[VSRID_N_S642]] to i64
+// CHECK: ret i64 [[VSRID_N_S643]]
int64_t test_vsrid_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vsrid_n_s64
-// CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (int64_t)vsrid_n_s64(a, b, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64>
%b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N]]1, i32 1) +// CHECK: ret <1 x i64> [[VSRI_N]]2 int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) { -// CHECK-LABEL: test_vsri_n_s64 -// CHECK: sri d{{[0-9]+}}, d{{[0-9]+}}, #1 return vsri_n_s64(a, b, 1); } +// CHECK-LABEL: define i64 @test_vsrid_n_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VSRID_N_U64:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: [[VSRID_N_U641:%.*]] = bitcast i64 %b to <1 x i64> +// CHECK: [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U64]]1, i32 63) +// CHECK: [[VSRID_N_U643:%.*]] = bitcast <1 x i64> [[VSRID_N_U64]]2 to i64 +// CHECK: ret i64 [[VSRID_N_U64]]3 uint64_t test_vsrid_n_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vsrid_n_u64 -// CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63 return (uint64_t)vsrid_n_u64(a, b, 63); } +// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N]]1, i32 1) +// CHECK: ret <1 x i64> [[VSRI_N]]2 uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) { -// CHECK-LABEL: test_vsri_n_u64 -// CHECK: sri d{{[0-9]+}}, d{{[0-9]+}}, #1 return vsri_n_u64(a, b, 1); } +// CHECK-LABEL: define i64 @test_vslid_n_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VSLID_N_S64:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: [[VSLID_N_S641:%.*]] = bitcast i64 %b to <1 x i64> +// CHECK: [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S64]]1, i32 63) +// CHECK: [[VSLID_N_S643:%.*]] = bitcast <1 x i64> [[VSLID_N_S64]]2 to i64 +// CHECK: ret i64 [[VSLID_N_S64]]3 int64_t test_vslid_n_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vslid_n_s64 -// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63 return (int64_t)vslid_n_s64(a, b, 63); } +// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N]]1, i32 1) +// CHECK: ret <1 x i64> [[VSLI_N]]2 int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) { -// CHECK-LABEL: test_vsli_n_s64 -// CHECK: sli d{{[0-9]+}}, d{{[0-9]+}}, #1 return vsli_n_s64(a, b, 1); } +// CHECK-LABEL: define i64 @test_vslid_n_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VSLID_N_U64:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: [[VSLID_N_U641:%.*]] = bitcast i64 %b to <1 x i64> +// CHECK: [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U64]]1, i32 63) +// CHECK: [[VSLID_N_U643:%.*]] = bitcast <1 x i64> [[VSLID_N_U64]]2 to i64 +// CHECK: ret i64 [[VSLID_N_U64]]3 uint64_t test_vslid_n_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: 
test_vslid_n_u64 -// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63 return (uint64_t)vslid_n_u64(a, b, 63); } +// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N]]1, i32 1) +// CHECK: ret <1 x i64> [[VSLI_N]]2 uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) { -// CHECK-LABEL: test_vsli_n_u64 -// CHECK: sli d{{[0-9]+}}, d{{[0-9]+}}, #1 return vsli_n_u64(a, b, 1); } +// CHECK-LABEL: define i8 @test_vqshrnh_n_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqshrnh_n_s16(int16_t a) { -// CHECK-LABEL: test_vqshrnh_n_s16 -// CHECK: sqshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (int8_t)vqshrnh_n_s16(a, 8); } +// CHECK-LABEL: define i16 @test_vqshrns_n_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqshrns_n_s32(int32_t a) { -// CHECK-LABEL: test_vqshrns_n_s32 -// CHECK: sqshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (int16_t)vqshrns_n_s32(a, 16); } +// CHECK-LABEL: define i32 @test_vqshrnd_n_s64(i64 %a) #0 { +// CHECK: [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQSHRND_N_S64]] int32_t test_vqshrnd_n_s64(int64_t a) { -// CHECK-LABEL: test_vqshrnd_n_s64 -// CHECK: sqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 return (int32_t)vqshrnd_n_s64(a, 32); } +// CHECK-LABEL: define i8 @test_vqshrnh_n_u16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0 +// CHECK: ret i8 [[TMP1]] uint8_t test_vqshrnh_n_u16(uint16_t a) { -// CHECK-LABEL: test_vqshrnh_n_u16 -// CHECK: uqshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (uint8_t)vqshrnh_n_u16(a, 8); } +// CHECK-LABEL: define i16 @test_vqshrns_n_u32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0 +// CHECK: ret i16 [[TMP1]] uint16_t test_vqshrns_n_u32(uint32_t a) { -// CHECK-LABEL: test_vqshrns_n_u32 -// CHECK: uqshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (uint16_t)vqshrns_n_u32(a, 16); } +// CHECK-LABEL: define i32 @test_vqshrnd_n_u64(i64 %a) #0 { +// CHECK: [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQSHRND_N_U64]] uint32_t test_vqshrnd_n_u64(uint64_t a) { -// CHECK-LABEL: test_vqshrnd_n_u64 -// CHECK: uqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 return 
(uint32_t)vqshrnd_n_u64(a, 32); } +// CHECK-LABEL: define i8 @test_vqrshrnh_n_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqrshrnh_n_s16(int16_t a) { -// CHECK-LABEL: test_vqrshrnh_n_s16 -// CHECK: sqrshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (int8_t)vqrshrnh_n_s16(a, 8); } +// CHECK-LABEL: define i16 @test_vqrshrns_n_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqrshrns_n_s32(int32_t a) { -// CHECK-LABEL: test_vqrshrns_n_s32 -// CHECK: sqrshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (int16_t)vqrshrns_n_s32(a, 16); } +// CHECK-LABEL: define i32 @test_vqrshrnd_n_s64(i64 %a) #0 { +// CHECK: [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQRSHRND_N_S64]] int32_t test_vqrshrnd_n_s64(int64_t a) { -// CHECK-LABEL: test_vqrshrnd_n_s64 -// CHECK: sqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 return (int32_t)vqrshrnd_n_s64(a, 32); } +// CHECK-LABEL: define i8 @test_vqrshrnh_n_u16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0 +// CHECK: ret i8 [[TMP1]] uint8_t test_vqrshrnh_n_u16(uint16_t a) { -// CHECK-LABEL: test_vqrshrnh_n_u16 -// CHECK: uqrshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (uint8_t)vqrshrnh_n_u16(a, 8); } +// CHECK-LABEL: define i16 @test_vqrshrns_n_u32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0 +// CHECK: ret i16 [[TMP1]] uint16_t test_vqrshrns_n_u32(uint32_t a) { -// CHECK-LABEL: test_vqrshrns_n_u32 -// CHECK: uqrshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (uint16_t)vqrshrns_n_u32(a, 16); } +// CHECK-LABEL: define i32 @test_vqrshrnd_n_u64(i64 %a) #0 { +// CHECK: [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQRSHRND_N_U64]] uint32_t test_vqrshrnd_n_u64(uint64_t a) { -// CHECK-LABEL: test_vqrshrnd_n_u64 -// CHECK: uqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 return (uint32_t)vqrshrnd_n_u64(a, 32); } +// CHECK-LABEL: define i8 @test_vqshrunh_n_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqshrunh_n_s16(int16_t a) { -// CHECK-LABEL: test_vqshrunh_n_s16 -// CHECK: sqshrun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (int8_t)vqshrunh_n_s16(a, 8); } +// CHECK-LABEL: define i16 @test_vqshruns_n_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, 
i32 %a, i64 0 +// CHECK: [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqshruns_n_s32(int32_t a) { -// CHECK-LABEL: test_vqshruns_n_s32 -// CHECK: sqshrun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (int16_t)vqshruns_n_s32(a, 16); } +// CHECK-LABEL: define i32 @test_vqshrund_n_s64(i64 %a) #0 { +// CHECK: [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQSHRUND_N_S64]] int32_t test_vqshrund_n_s64(int64_t a) { -// CHECK-LABEL: test_vqshrund_n_s64 -// CHECK: sqshrun {{s[0-9]+}}, {{d[0-9]+}}, #32 return (int32_t)vqshrund_n_s64(a, 32); } +// CHECK-LABEL: define i8 @test_vqrshrunh_n_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqrshrunh_n_s16(int16_t a) { -// CHECK-LABEL: test_vqrshrunh_n_s16 -// CHECK: sqrshrun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (int8_t)vqrshrunh_n_s16(a, 8); } +// CHECK-LABEL: define i16 @test_vqrshruns_n_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqrshruns_n_s32(int32_t a) { -// CHECK-LABEL: test_vqrshruns_n_s32 -// CHECK: sqrshrun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (int16_t)vqrshruns_n_s32(a, 16); } +// CHECK-LABEL: define i32 @test_vqrshrund_n_s64(i64 %a) #0 { +// CHECK: [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQRSHRUND_N_S64]] int32_t test_vqrshrund_n_s64(int64_t a) { -// CHECK-LABEL: test_vqrshrund_n_s64 -// CHECK: sqrshrun {{s[0-9]+}}, {{d[0-9]+}}, #32 return (int32_t)vqrshrund_n_s64(a, 32); } +// CHECK-LABEL: define float @test_vcvts_n_f32_s32(i32 %a) #0 { +// CHECK: [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 %a, i32 1) +// CHECK: ret float [[VCVTS_N_F32_S32]] float32_t test_vcvts_n_f32_s32(int32_t a) { -// CHECK-LABEL: test_vcvts_n_f32_s32 -// CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1 return vcvts_n_f32_s32(a, 1); } +// CHECK-LABEL: define double @test_vcvtd_n_f64_s64(i64 %a) #0 { +// CHECK: [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %a, i32 1) +// CHECK: ret double [[VCVTD_N_F64_S64]] float64_t test_vcvtd_n_f64_s64(int64_t a) { -// CHECK-LABEL: test_vcvtd_n_f64_s64 -// CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1 return vcvtd_n_f64_s64(a, 1); } +// CHECK-LABEL: define float @test_vcvts_n_f32_u32(i32 %a) #0 { +// CHECK: [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 %a, i32 32) +// CHECK: ret float [[VCVTS_N_F32_U32]] float32_t test_vcvts_n_f32_u32(uint32_t a) { -// CHECK-LABEL: test_vcvts_n_f32_u32 -// CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #32 return vcvts_n_f32_u32(a, 32); } +// CHECK-LABEL: define double @test_vcvtd_n_f64_u64(i64 %a) #0 { +// CHECK: [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 %a, i32 64) +// CHECK: ret double [[VCVTD_N_F64_U64]] 
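+// A minimal C sketch of the fixed-point converts checked above (hypothetical
+// helper for illustration only, assuming <math.h>'s ldexp): scvtf/ucvtf with
+// n fractional bits divides the converted value by 2^n, i.e.
+//   double ucvtf_n_f64_u64(uint64_t a, int n) { return ldexp((double)a, -n); }
+// so vcvtd_n_f64_u64(a, 64) yields (double)a * 2^-64.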
float64_t test_vcvtd_n_f64_u64(uint64_t a) { -// CHECK-LABEL: test_vcvtd_n_f64_u64 -// CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #64 return vcvtd_n_f64_u64(a, 64); } +// CHECK-LABEL: define i32 @test_vcvts_n_s32_f32(float %a) #0 { +// CHECK: [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float %a, i32 1) +// CHECK: ret i32 [[VCVTS_N_S32_F32]] int32_t test_vcvts_n_s32_f32(float32_t a) { -// CHECK-LABEL: test_vcvts_n_s32_f32 -// CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1 return (int32_t)vcvts_n_s32_f32(a, 1); } +// CHECK-LABEL: define i64 @test_vcvtd_n_s64_f64(double %a) #0 { +// CHECK: [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double %a, i32 1) +// CHECK: ret i64 [[VCVTD_N_S64_F64]] int64_t test_vcvtd_n_s64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtd_n_s64_f64 -// CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1 return (int64_t)vcvtd_n_s64_f64(a, 1); } +// CHECK-LABEL: define i32 @test_vcvts_n_u32_f32(float %a) #0 { +// CHECK: [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float %a, i32 32) +// CHECK: ret i32 [[VCVTS_N_U32_F32]] uint32_t test_vcvts_n_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvts_n_u32_f32 -// CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32 return (uint32_t)vcvts_n_u32_f32(a, 32); } +// CHECK-LABEL: define i64 @test_vcvtd_n_u64_f64(double %a) #0 { +// CHECK: [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double %a, i32 64) +// CHECK: ret i64 [[VCVTD_N_U64_F64]] uint64_t test_vcvtd_n_u64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtd_n_u64_f64 -// CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64 return (uint64_t)vcvtd_n_u64_f64(a, 64); } -// CHECK-LABEL: test_vreinterpret_s8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s16(int16x4_t a) { return vreinterpret_s8_s16(a); } -// CHECK-LABEL: test_vreinterpret_s8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s32(int32x2_t a) { return vreinterpret_s8_s32(a); } -// CHECK-LABEL: test_vreinterpret_s8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s64(int64x1_t a) { return vreinterpret_s8_s64(a); } -// CHECK-LABEL: test_vreinterpret_s8_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) { return vreinterpret_s8_u8(a); } -// CHECK-LABEL: test_vreinterpret_s8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) { return vreinterpret_s8_u16(a); } -// CHECK-LABEL: test_vreinterpret_s8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) { return vreinterpret_s8_u32(a); } -// CHECK-LABEL: test_vreinterpret_s8_u64: -// 
CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) { return vreinterpret_s8_u64(a); } -// CHECK-LABEL: test_vreinterpret_s8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f16(float16x4_t a) { return vreinterpret_s8_f16(a); } -// CHECK-LABEL: test_vreinterpret_s8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f32(float32x2_t a) { return vreinterpret_s8_f32(a); } -// CHECK-LABEL: test_vreinterpret_s8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f64(float64x1_t a) { return vreinterpret_s8_f64(a); } -// CHECK-LABEL: test_vreinterpret_s8_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) { return vreinterpret_s8_p8(a); } -// CHECK-LABEL: test_vreinterpret_s8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) { return vreinterpret_s8_p16(a); } -// CHECK-LABEL: test_vreinterpret_s8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) { return vreinterpret_s8_p64(a); } -// CHECK-LABEL: test_vreinterpret_s16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s8(int8x8_t a) { return vreinterpret_s16_s8(a); } -// CHECK-LABEL: test_vreinterpret_s16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s32(int32x2_t a) { return vreinterpret_s16_s32(a); } -// CHECK-LABEL: test_vreinterpret_s16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s64(int64x1_t a) { return vreinterpret_s16_s64(a); } -// CHECK-LABEL: test_vreinterpret_s16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) { return vreinterpret_s16_u8(a); } -// CHECK-LABEL: test_vreinterpret_s16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 { +// CHECK: ret <4 x 
i16> %a int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) { return vreinterpret_s16_u16(a); } -// CHECK-LABEL: test_vreinterpret_s16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) { return vreinterpret_s16_u32(a); } -// CHECK-LABEL: test_vreinterpret_s16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) { return vreinterpret_s16_u64(a); } -// CHECK-LABEL: test_vreinterpret_s16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f16(float16x4_t a) { return vreinterpret_s16_f16(a); } -// CHECK-LABEL: test_vreinterpret_s16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f32(float32x2_t a) { return vreinterpret_s16_f32(a); } -// CHECK-LABEL: test_vreinterpret_s16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f64(float64x1_t a) { return vreinterpret_s16_f64(a); } -// CHECK-LABEL: test_vreinterpret_s16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) { return vreinterpret_s16_p8(a); } -// CHECK-LABEL: test_vreinterpret_s16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) { return vreinterpret_s16_p16(a); } -// CHECK-LABEL: test_vreinterpret_s16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) { return vreinterpret_s16_p64(a); } -// CHECK-LABEL: test_vreinterpret_s32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s8(int8x8_t a) { return vreinterpret_s32_s8(a); } -// CHECK-LABEL: test_vreinterpret_s32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s16(int16x4_t a) { return vreinterpret_s32_s16(a); } -// CHECK-LABEL: test_vreinterpret_s32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s64(int64x1_t a) 
{ return vreinterpret_s32_s64(a); } -// CHECK-LABEL: test_vreinterpret_s32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) { return vreinterpret_s32_u8(a); } -// CHECK-LABEL: test_vreinterpret_s32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) { return vreinterpret_s32_u16(a); } -// CHECK-LABEL: test_vreinterpret_s32_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 { +// CHECK: ret <2 x i32> %a int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) { return vreinterpret_s32_u32(a); } -// CHECK-LABEL: test_vreinterpret_s32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) { return vreinterpret_s32_u64(a); } -// CHECK-LABEL: test_vreinterpret_s32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f16(float16x4_t a) { return vreinterpret_s32_f16(a); } -// CHECK-LABEL: test_vreinterpret_s32_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f32(float32x2_t a) { return vreinterpret_s32_f32(a); } -// CHECK-LABEL: test_vreinterpret_s32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f64(float64x1_t a) { return vreinterpret_s32_f64(a); } -// CHECK-LABEL: test_vreinterpret_s32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) { return vreinterpret_s32_p8(a); } -// CHECK-LABEL: test_vreinterpret_s32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) { return vreinterpret_s32_p16(a); } -// CHECK-LABEL: test_vreinterpret_s32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) { return vreinterpret_s32_p64(a); } -// CHECK-LABEL: test_vreinterpret_s64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s8(int8x8_t a) { return vreinterpret_s64_s8(a); } -// CHECK-LABEL: 
test_vreinterpret_s64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s16(int16x4_t a) { return vreinterpret_s64_s16(a); } -// CHECK-LABEL: test_vreinterpret_s64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s32(int32x2_t a) { return vreinterpret_s64_s32(a); } -// CHECK-LABEL: test_vreinterpret_s64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) { return vreinterpret_s64_u8(a); } -// CHECK-LABEL: test_vreinterpret_s64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) { return vreinterpret_s64_u16(a); } -// CHECK-LABEL: test_vreinterpret_s64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) { return vreinterpret_s64_u32(a); } -// CHECK-LABEL: test_vreinterpret_s64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) { return vreinterpret_s64_u64(a); } -// CHECK-LABEL: test_vreinterpret_s64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f16(float16x4_t a) { return vreinterpret_s64_f16(a); } -// CHECK-LABEL: test_vreinterpret_s64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f32(float32x2_t a) { return vreinterpret_s64_f32(a); } -// CHECK-LABEL: test_vreinterpret_s64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f64(float64x1_t a) { return vreinterpret_s64_f64(a); } -// CHECK-LABEL: test_vreinterpret_s64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) { return vreinterpret_s64_p8(a); } -// CHECK-LABEL: test_vreinterpret_s64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) { return vreinterpret_s64_p16(a); } -// CHECK-LABEL: test_vreinterpret_s64_p64: -// CHECK-NEXT: ret +// 
CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a int64x1_t test_vreinterpret_s64_p64(poly64x1_t a) { return vreinterpret_s64_p64(a); } -// CHECK-LABEL: test_vreinterpret_u8_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) { return vreinterpret_u8_s8(a); } -// CHECK-LABEL: test_vreinterpret_u8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) { return vreinterpret_u8_s16(a); } -// CHECK-LABEL: test_vreinterpret_u8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) { return vreinterpret_u8_s32(a); } -// CHECK-LABEL: test_vreinterpret_u8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) { return vreinterpret_u8_s64(a); } -// CHECK-LABEL: test_vreinterpret_u8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) { return vreinterpret_u8_u16(a); } -// CHECK-LABEL: test_vreinterpret_u8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) { return vreinterpret_u8_u32(a); } -// CHECK-LABEL: test_vreinterpret_u8_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) { return vreinterpret_u8_u64(a); } -// CHECK-LABEL: test_vreinterpret_u8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) { return vreinterpret_u8_f16(a); } -// CHECK-LABEL: test_vreinterpret_u8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) { return vreinterpret_u8_f32(a); } -// CHECK-LABEL: test_vreinterpret_u8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f64(float64x1_t a) { return vreinterpret_u8_f64(a); } -// CHECK-LABEL: test_vreinterpret_u8_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) { return vreinterpret_u8_p8(a); } -// 
CHECK-LABEL: test_vreinterpret_u8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) { return vreinterpret_u8_p16(a); } -// CHECK-LABEL: test_vreinterpret_u8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) { return vreinterpret_u8_p64(a); } -// CHECK-LABEL: test_vreinterpret_u16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) { return vreinterpret_u16_s8(a); } -// CHECK-LABEL: test_vreinterpret_u16_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) { return vreinterpret_u16_s16(a); } -// CHECK-LABEL: test_vreinterpret_u16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) { return vreinterpret_u16_s32(a); } -// CHECK-LABEL: test_vreinterpret_u16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) { return vreinterpret_u16_s64(a); } -// CHECK-LABEL: test_vreinterpret_u16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) { return vreinterpret_u16_u8(a); } -// CHECK-LABEL: test_vreinterpret_u16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) { return vreinterpret_u16_u32(a); } -// CHECK-LABEL: test_vreinterpret_u16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) { return vreinterpret_u16_u64(a); } -// CHECK-LABEL: test_vreinterpret_u16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) { return vreinterpret_u16_f16(a); } -// CHECK-LABEL: test_vreinterpret_u16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) { return vreinterpret_u16_f32(a); } -// CHECK-LABEL: test_vreinterpret_u16_f64: -// CHECK-NEXT: ret +// 
CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) { return vreinterpret_u16_f64(a); } -// CHECK-LABEL: test_vreinterpret_u16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) { return vreinterpret_u16_p8(a); } -// CHECK-LABEL: test_vreinterpret_u16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) { return vreinterpret_u16_p16(a); } -// CHECK-LABEL: test_vreinterpret_u16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) { return vreinterpret_u16_p64(a); } -// CHECK-LABEL: test_vreinterpret_u32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) { return vreinterpret_u32_s8(a); } -// CHECK-LABEL: test_vreinterpret_u32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) { return vreinterpret_u32_s16(a); } -// CHECK-LABEL: test_vreinterpret_u32_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 { +// CHECK: ret <2 x i32> %a uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) { return vreinterpret_u32_s32(a); } -// CHECK-LABEL: test_vreinterpret_u32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) { return vreinterpret_u32_s64(a); } -// CHECK-LABEL: test_vreinterpret_u32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) { return vreinterpret_u32_u8(a); } -// CHECK-LABEL: test_vreinterpret_u32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) { return vreinterpret_u32_u16(a); } -// CHECK-LABEL: test_vreinterpret_u32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) { return vreinterpret_u32_u64(a); } -// CHECK-LABEL: test_vreinterpret_u32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to 
<2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) { return vreinterpret_u32_f16(a); } -// CHECK-LABEL: test_vreinterpret_u32_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) { return vreinterpret_u32_f32(a); } -// CHECK-LABEL: test_vreinterpret_u32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) { return vreinterpret_u32_f64(a); } -// CHECK-LABEL: test_vreinterpret_u32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) { return vreinterpret_u32_p8(a); } -// CHECK-LABEL: test_vreinterpret_u32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) { return vreinterpret_u32_p16(a); } -// CHECK-LABEL: test_vreinterpret_u32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) { return vreinterpret_u32_p64(a); } -// CHECK-LABEL: test_vreinterpret_u64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) { return vreinterpret_u64_s8(a); } -// CHECK-LABEL: test_vreinterpret_u64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) { return vreinterpret_u64_s16(a); } -// CHECK-LABEL: test_vreinterpret_u64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) { return vreinterpret_u64_s32(a); } -// CHECK-LABEL: test_vreinterpret_u64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) { return vreinterpret_u64_s64(a); } -// CHECK-LABEL: test_vreinterpret_u64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) { return vreinterpret_u64_u8(a); } -// CHECK-LABEL: test_vreinterpret_u64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t 
test_vreinterpret_u64_u16(uint16x4_t a) { return vreinterpret_u64_u16(a); } -// CHECK-LABEL: test_vreinterpret_u64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) { return vreinterpret_u64_u32(a); } -// CHECK-LABEL: test_vreinterpret_u64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) { return vreinterpret_u64_f16(a); } -// CHECK-LABEL: test_vreinterpret_u64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) { return vreinterpret_u64_f32(a); } -// CHECK-LABEL: test_vreinterpret_u64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) { return vreinterpret_u64_f64(a); } -// CHECK-LABEL: test_vreinterpret_u64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) { return vreinterpret_u64_p8(a); } -// CHECK-LABEL: test_vreinterpret_u64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) { return vreinterpret_u64_p16(a); } -// CHECK-LABEL: test_vreinterpret_u64_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a uint64x1_t test_vreinterpret_u64_p64(poly64x1_t a) { return vreinterpret_u64_p64(a); } -// CHECK-LABEL: test_vreinterpret_f16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s8(int8x8_t a) { return vreinterpret_f16_s8(a); } -// CHECK-LABEL: test_vreinterpret_f16_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s16(int16x4_t a) { return vreinterpret_f16_s16(a); } -// CHECK-LABEL: test_vreinterpret_f16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s32(int32x2_t a) { return vreinterpret_f16_s32(a); } -// CHECK-LABEL: test_vreinterpret_f16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t 
test_vreinterpret_f16_s64(int64x1_t a) { return vreinterpret_f16_s64(a); } -// CHECK-LABEL: test_vreinterpret_f16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) { return vreinterpret_f16_u8(a); } -// CHECK-LABEL: test_vreinterpret_f16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) { return vreinterpret_f16_u16(a); } -// CHECK-LABEL: test_vreinterpret_f16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) { return vreinterpret_f16_u32(a); } -// CHECK-LABEL: test_vreinterpret_f16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) { return vreinterpret_f16_u64(a); } -// CHECK-LABEL: test_vreinterpret_f16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_f32(float32x2_t a) { return vreinterpret_f16_f32(a); } -// CHECK-LABEL: test_vreinterpret_f16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_f64(float64x1_t a) { return vreinterpret_f16_f64(a); } -// CHECK-LABEL: test_vreinterpret_f16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) { return vreinterpret_f16_p8(a); } -// CHECK-LABEL: test_vreinterpret_f16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) { return vreinterpret_f16_p16(a); } -// CHECK-LABEL: test_vreinterpret_f16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) { return vreinterpret_f16_p64(a); } -// CHECK-LABEL: test_vreinterpret_f32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s8(int8x8_t a) { return vreinterpret_f32_s8(a); } -// CHECK-LABEL: test_vreinterpret_f32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to 
<2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s16(int16x4_t a) { return vreinterpret_f32_s16(a); } -// CHECK-LABEL: test_vreinterpret_f32_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s32(int32x2_t a) { return vreinterpret_f32_s32(a); } -// CHECK-LABEL: test_vreinterpret_f32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s64(int64x1_t a) { return vreinterpret_f32_s64(a); } -// CHECK-LABEL: test_vreinterpret_f32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) { return vreinterpret_f32_u8(a); } -// CHECK-LABEL: test_vreinterpret_f32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) { return vreinterpret_f32_u16(a); } -// CHECK-LABEL: test_vreinterpret_f32_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) { return vreinterpret_f32_u32(a); } -// CHECK-LABEL: test_vreinterpret_f32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) { return vreinterpret_f32_u64(a); } -// CHECK-LABEL: test_vreinterpret_f32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_f16(float16x4_t a) { return vreinterpret_f32_f16(a); } -// CHECK-LABEL: test_vreinterpret_f32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_f64(float64x1_t a) { return vreinterpret_f32_f64(a); } -// CHECK-LABEL: test_vreinterpret_f32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) { return vreinterpret_f32_p8(a); } -// CHECK-LABEL: test_vreinterpret_f32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) { return vreinterpret_f32_p16(a); } -// CHECK-LABEL: test_vreinterpret_f32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> 
@test_vreinterpret_f32_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) { return vreinterpret_f32_p64(a); } -// CHECK-LABEL: test_vreinterpret_f64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_s8(int8x8_t a) { return vreinterpret_f64_s8(a); } -// CHECK-LABEL: test_vreinterpret_f64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_s16(int16x4_t a) { return vreinterpret_f64_s16(a); } -// CHECK-LABEL: test_vreinterpret_f64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_s32(int32x2_t a) { return vreinterpret_f64_s32(a); } -// CHECK-LABEL: test_vreinterpret_f64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_s64(int64x1_t a) { return vreinterpret_f64_s64(a); } -// CHECK-LABEL: test_vreinterpret_f64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) { return vreinterpret_f64_u8(a); } -// CHECK-LABEL: test_vreinterpret_f64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) { return vreinterpret_f64_u16(a); } -// CHECK-LABEL: test_vreinterpret_f64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) { return vreinterpret_f64_u32(a); } -// CHECK-LABEL: test_vreinterpret_f64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) { return vreinterpret_f64_u64(a); } -// CHECK-LABEL: test_vreinterpret_f64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_f16(float16x4_t a) { return vreinterpret_f64_f16(a); } -// CHECK-LABEL: test_vreinterpret_f64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_f32(float32x2_t a) { return 
vreinterpret_f64_f32(a); } -// CHECK-LABEL: test_vreinterpret_f64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) { return vreinterpret_f64_p8(a); } -// CHECK-LABEL: test_vreinterpret_f64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) { return vreinterpret_f64_p16(a); } -// CHECK-LABEL: test_vreinterpret_f64_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_p64(poly64x1_t a) { return vreinterpret_f64_p64(a); } -// CHECK-LABEL: test_vreinterpret_p8_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) { return vreinterpret_p8_s8(a); } -// CHECK-LABEL: test_vreinterpret_p8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) { return vreinterpret_p8_s16(a); } -// CHECK-LABEL: test_vreinterpret_p8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) { return vreinterpret_p8_s32(a); } -// CHECK-LABEL: test_vreinterpret_p8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) { return vreinterpret_p8_s64(a); } -// CHECK-LABEL: test_vreinterpret_p8_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) { return vreinterpret_p8_u8(a); } -// CHECK-LABEL: test_vreinterpret_p8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) { return vreinterpret_p8_u16(a); } -// CHECK-LABEL: test_vreinterpret_p8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) { return vreinterpret_p8_u32(a); } -// CHECK-LABEL: test_vreinterpret_p8_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) { return vreinterpret_p8_u64(a); } -// CHECK-LABEL: test_vreinterpret_p8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 
{ +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) { return vreinterpret_p8_f16(a); } -// CHECK-LABEL: test_vreinterpret_p8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) { return vreinterpret_p8_f32(a); } -// CHECK-LABEL: test_vreinterpret_p8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) { return vreinterpret_p8_f64(a); } -// CHECK-LABEL: test_vreinterpret_p8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) { return vreinterpret_p8_p16(a); } -// CHECK-LABEL: test_vreinterpret_p8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) { return vreinterpret_p8_p64(a); } -// CHECK-LABEL: test_vreinterpret_p16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) { return vreinterpret_p16_s8(a); } -// CHECK-LABEL: test_vreinterpret_p16_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) { return vreinterpret_p16_s16(a); } -// CHECK-LABEL: test_vreinterpret_p16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) { return vreinterpret_p16_s32(a); } -// CHECK-LABEL: test_vreinterpret_p16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) { return vreinterpret_p16_s64(a); } -// CHECK-LABEL: test_vreinterpret_p16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) { return vreinterpret_p16_u8(a); } -// CHECK-LABEL: test_vreinterpret_p16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) { return vreinterpret_p16_u16(a); } -// CHECK-LABEL: test_vreinterpret_p16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) { 
return vreinterpret_p16_u32(a); } -// CHECK-LABEL: test_vreinterpret_p16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) { return vreinterpret_p16_u64(a); } -// CHECK-LABEL: test_vreinterpret_p16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) { return vreinterpret_p16_f16(a); } -// CHECK-LABEL: test_vreinterpret_p16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) { return vreinterpret_p16_f32(a); } -// CHECK-LABEL: test_vreinterpret_p16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) { return vreinterpret_p16_f64(a); } -// CHECK-LABEL: test_vreinterpret_p16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) { return vreinterpret_p16_p8(a); } -// CHECK-LABEL: test_vreinterpret_p16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) { return vreinterpret_p16_p64(a); } -// CHECK-LABEL: test_vreinterpret_p64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) { return vreinterpret_p64_s8(a); } -// CHECK-LABEL: test_vreinterpret_p64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) { return vreinterpret_p64_s16(a); } -// CHECK-LABEL: test_vreinterpret_p64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_s32(int32x2_t a) { return vreinterpret_p64_s32(a); } -// CHECK-LABEL: test_vreinterpret_p64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a poly64x1_t test_vreinterpret_p64_s64(int64x1_t a) { return vreinterpret_p64_s64(a); } -// CHECK-LABEL: test_vreinterpret_p64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) { return vreinterpret_p64_u8(a); } -// CHECK-LABEL: 
test_vreinterpret_p64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) { return vreinterpret_p64_u16(a); } -// CHECK-LABEL: test_vreinterpret_p64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_u32(uint32x2_t a) { return vreinterpret_p64_u32(a); } -// CHECK-LABEL: test_vreinterpret_p64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a poly64x1_t test_vreinterpret_p64_u64(uint64x1_t a) { return vreinterpret_p64_u64(a); } -// CHECK-LABEL: test_vreinterpret_p64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) { return vreinterpret_p64_f16(a); } -// CHECK-LABEL: test_vreinterpret_p64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) { return vreinterpret_p64_f32(a); } -// CHECK-LABEL: test_vreinterpret_p64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) { return vreinterpret_p64_f64(a); } -// CHECK-LABEL: test_vreinterpret_p64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) { return vreinterpret_p64_p8(a); } -// CHECK-LABEL: test_vreinterpret_p64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) { return vreinterpret_p64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) { return vreinterpretq_s8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) { return vreinterpretq_s8_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) { return vreinterpretq_s8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u8: -// CHECK-NEXT: 
ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) { return vreinterpretq_s8_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) { return vreinterpretq_s8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) { return vreinterpretq_s8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) { return vreinterpretq_s8_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) { return vreinterpretq_s8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) { return vreinterpretq_s8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f64(float64x2_t a) { return vreinterpretq_s8_f64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) { return vreinterpretq_s8_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) { return vreinterpretq_s8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) { return vreinterpretq_s8_p64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) { return vreinterpretq_s16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = 
bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) { return vreinterpretq_s16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) { return vreinterpretq_s16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) { return vreinterpretq_s16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) { return vreinterpretq_s16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) { return vreinterpretq_s16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) { return vreinterpretq_s16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) { return vreinterpretq_s16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) { return vreinterpretq_s16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) { return vreinterpretq_s16_f64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) { return vreinterpretq_s16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) { return vreinterpretq_s16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] 
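For readers new to the FileCheck idiom used throughout this test: [[TMP0:%.*]] at its first occurrence captures whatever SSA name the regex %.* matches into the variable TMP0, and the following line then requires that same value, so each pair of checks verifies that the value produced by the bitcast is exactly the value returned. For example:

// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]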
int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) { return vreinterpretq_s16_p64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) { return vreinterpretq_s32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) { return vreinterpretq_s32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) { return vreinterpretq_s32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) { return vreinterpretq_s32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) { return vreinterpretq_s32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 { +// CHECK: ret <4 x i32> %a int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) { return vreinterpretq_s32_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) { return vreinterpretq_s32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) { return vreinterpretq_s32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) { return vreinterpretq_s32_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) { return vreinterpretq_s32_f64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t 
test_vreinterpretq_s32_p8(poly8x16_t a) { return vreinterpretq_s32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) { return vreinterpretq_s32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) { return vreinterpretq_s32_p64(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) { return vreinterpretq_s64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) { return vreinterpretq_s64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) { return vreinterpretq_s64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) { return vreinterpretq_s64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) { return vreinterpretq_s64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) { return vreinterpretq_s64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) { return vreinterpretq_s64_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) { return vreinterpretq_s64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t 
test_vreinterpretq_s64_f32(float32x4_t a) { return vreinterpretq_s64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) { return vreinterpretq_s64_f64(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) { return vreinterpretq_s64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) { return vreinterpretq_s64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a int64x2_t test_vreinterpretq_s64_p64(poly64x2_t a) { return vreinterpretq_s64_p64(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) { return vreinterpretq_u8_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) { return vreinterpretq_u8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) { return vreinterpretq_u8_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) { return vreinterpretq_u8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) { return vreinterpretq_u8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) { return vreinterpretq_u8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) { return vreinterpretq_u8_u64(a); } -// CHECK-LABEL: 
test_vreinterpretq_u8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) { return vreinterpretq_u8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) { return vreinterpretq_u8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f64(float64x2_t a) { return vreinterpretq_u8_f64(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) { return vreinterpretq_u8_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) { return vreinterpretq_u8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) { return vreinterpretq_u8_p64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) { return vreinterpretq_u16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) { return vreinterpretq_u16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) { return vreinterpretq_u16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) { return vreinterpretq_u16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) { return vreinterpretq_u16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> 
@test_vreinterpretq_u16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) { return vreinterpretq_u16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) { return vreinterpretq_u16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) { return vreinterpretq_u16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) { return vreinterpretq_u16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) { return vreinterpretq_u16_f64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) { return vreinterpretq_u16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) { return vreinterpretq_u16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) { return vreinterpretq_u16_p64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) { return vreinterpretq_u32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) { return vreinterpretq_u32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 { +// CHECK: ret <4 x i32> %a uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) { return vreinterpretq_u32_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) { return vreinterpretq_u32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) { return vreinterpretq_u32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) { return vreinterpretq_u32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) { return vreinterpretq_u32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) { return vreinterpretq_u32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) { return vreinterpretq_u32_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) { return vreinterpretq_u32_f64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) { return vreinterpretq_u32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) { return vreinterpretq_u32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) { return vreinterpretq_u32_p64(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) { return vreinterpretq_u64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 
x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) { return vreinterpretq_u64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) { return vreinterpretq_u64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) { return vreinterpretq_u64_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) { return vreinterpretq_u64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) { return vreinterpretq_u64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) { return vreinterpretq_u64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) { return vreinterpretq_u64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) { return vreinterpretq_u64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) { return vreinterpretq_u64_f64(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) { return vreinterpretq_u64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) { return vreinterpretq_u64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define 
<2 x i64> @test_vreinterpretq_u64_p64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a uint64x2_t test_vreinterpretq_u64_p64(poly64x2_t a) { return vreinterpretq_u64_p64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) { return vreinterpretq_f16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) { return vreinterpretq_f16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) { return vreinterpretq_f16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) { return vreinterpretq_f16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) { return vreinterpretq_f16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) { return vreinterpretq_f16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) { return vreinterpretq_f16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) { return vreinterpretq_f16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) { return vreinterpretq_f16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_f16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) { return vreinterpretq_f16_f64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_p8: -// 
CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) { return vreinterpretq_f16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) { return vreinterpretq_f16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) { return vreinterpretq_f16_p64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) { return vreinterpretq_f32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) { return vreinterpretq_f32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) { return vreinterpretq_f32_s32(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) { return vreinterpretq_f32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) { return vreinterpretq_f32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) { return vreinterpretq_f32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) { return vreinterpretq_f32_u32(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t 
test_vreinterpretq_f32_u64(uint64x2_t a) { return vreinterpretq_f32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) { return vreinterpretq_f32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) { return vreinterpretq_f32_f64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) { return vreinterpretq_f32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) { return vreinterpretq_f32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) { return vreinterpretq_f32_p64(a); } -// CHECK-LABEL: test_vreinterpretq_f64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) { return vreinterpretq_f64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_f64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) { return vreinterpretq_f64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_f64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) { return vreinterpretq_f64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_f64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) { return vreinterpretq_f64_s64(a); } -// CHECK-LABEL: test_vreinterpretq_f64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) { return vreinterpretq_f64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_f64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> 
@test_vreinterpretq_f64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) { return vreinterpretq_f64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_f64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) { return vreinterpretq_f64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_f64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) { return vreinterpretq_f64_u64(a); } -// CHECK-LABEL: test_vreinterpretq_f64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) { return vreinterpretq_f64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_f64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) { return vreinterpretq_f64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_f64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) { return vreinterpretq_f64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_f64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p16(poly16x8_t a) { return vreinterpretq_f64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_f64_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p64(poly64x2_t a) { return vreinterpretq_f64_p64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) { return vreinterpretq_p8_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) { return vreinterpretq_p8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) { return vreinterpretq_p8_s32(a); } -// CHECK-LABEL: 
test_vreinterpretq_p8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) { return vreinterpretq_p8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) { return vreinterpretq_p8_u8(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) { return vreinterpretq_p8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) { return vreinterpretq_p8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) { return vreinterpretq_p8_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) { return vreinterpretq_p8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) { return vreinterpretq_p8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) { return vreinterpretq_p8_f64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) { return vreinterpretq_p8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) { return vreinterpretq_p8_p64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) { return vreinterpretq_p16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s16: -// 
CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) { return vreinterpretq_p16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) { return vreinterpretq_p16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) { return vreinterpretq_p16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) { return vreinterpretq_p16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) { return vreinterpretq_p16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) { return vreinterpretq_p16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) { return vreinterpretq_p16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) { return vreinterpretq_p16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) { return vreinterpretq_p16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) { return vreinterpretq_p16_f64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) { return vreinterpretq_p16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_p16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> 
@test_vreinterpretq_p16_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) { return vreinterpretq_p16_p64(a); } -// CHECK-LABEL: test_vreinterpretq_p64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) { return vreinterpretq_p64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) { return vreinterpretq_p64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_s32(int32x4_t a) { return vreinterpretq_p64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_p64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a poly64x2_t test_vreinterpretq_p64_s64(int64x2_t a) { return vreinterpretq_p64_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) { return vreinterpretq_p64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_p64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) { return vreinterpretq_p64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_u32(uint32x4_t a) { return vreinterpretq_p64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a poly64x2_t test_vreinterpretq_p64_u64(uint64x2_t a) { return vreinterpretq_p64_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) { return vreinterpretq_p64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_p64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) { return vreinterpretq_p64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f64(<2 x double> %a) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) { return vreinterpretq_p64_f64(a); } -// CHECK-LABEL: test_vreinterpretq_p64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) { return vreinterpretq_p64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_p64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_p16(poly16x8_t a) { return vreinterpretq_p64_p16(a); } +// CHECK-LABEL: define float @test_vabds_f32(float %a, float %b) #0 { +// CHECK: [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) #4 +// CHECK: ret float [[VABDS_F32_I]] float32_t test_vabds_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vabds_f32 -// CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} return vabds_f32(a, b); } +// CHECK-LABEL: define double @test_vabdd_f64(double %a, double %b) #0 { +// CHECK: [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) #4 +// CHECK: ret double [[VABDD_F64_I]] float64_t test_vabdd_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vabdd_f64 -// CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} return vabdd_f64(a, b); } +// CHECK-LABEL: define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> [[VUQADD_I]], <1 x i64> [[VUQADD1_I]]) #4 +// CHECK: ret <1 x i64> [[VUQADD2_I]] int64x1_t test_vuqadd_s64(int64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vuqadd_s64 return vuqadd_s64(a, b); - // CHECK: suqadd d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> [[VSQADD_I]], <1 x i64> [[VSQADD1_I]]) #4 +// CHECK: ret <1 x i64> [[VSQADD2_I]] uint64x1_t test_vsqadd_u64(uint64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vsqadd_u64 return vsqadd_u64(a, b); - // CHECK: usqadd d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vsqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VSQADD_I]] uint8x8_t test_vsqadd_u8(uint8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsqadd_u8 return vsqadd_u8(a, b); - // CHECK: usqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vsqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VSQADD_I]] uint8x16_t 
test_vsqaddq_u8(uint8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsqaddq_u8 return vsqaddq_u8(a, b); - // CHECK: usqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vsqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[VSQADD_I]], <4 x i16> [[VSQADD1_I]]) #4 +// CHECK: ret <4 x i16> [[VSQADD2_I]] uint16x4_t test_vsqadd_u16(uint16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsqadd_u16 return vsqadd_u16(a, b); - // CHECK: usqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vsqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> [[VSQADD_I]], <8 x i16> [[VSQADD1_I]]) #4 +// CHECK: ret <8 x i16> [[VSQADD2_I]] uint16x8_t test_vsqaddq_u16(uint16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsqaddq_u16 return vsqaddq_u16(a, b); - // CHECK: usqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vsqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> [[VSQADD_I]], <2 x i32> [[VSQADD1_I]]) #4 +// CHECK: ret <2 x i32> [[VSQADD2_I]] uint32x2_t test_vsqadd_u32(uint32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsqadd_u32 return vsqadd_u32(a, b); - // CHECK: usqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vsqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> [[VSQADD_I]], <4 x i32> [[VSQADD1_I]]) #4 +// CHECK: ret <4 x i32> [[VSQADD2_I]] uint32x4_t test_vsqaddq_u32(uint32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsqaddq_u32 return vsqaddq_u32(a, b); - // CHECK: usqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vsqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> [[VSQADD_I]], <2 x i64> [[VSQADD1_I]]) #4 +// CHECK: ret <2 x i64> [[VSQADD2_I]] uint64x2_t test_vsqaddq_u64(uint64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsqaddq_u64 return vsqaddq_u64(a, b); - // CHECK: usqadd {{v[0-9]+}}.2d, 
{{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 x i64> @test_vabs_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> [[VABS_I]]) #4 +// CHECK: ret <1 x i64> [[VABS1_I]] int64x1_t test_vabs_s64(int64x1_t a) { - // CHECK-LABEL: test_vabs_s64 return vabs_s64(a); - // CHECK: abs d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vqabs_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> [[VQABS_V_I]]) #4 +// CHECK: [[VQABS_V2_I:%.*]] = bitcast <1 x i64> [[VQABS_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP1]] int64x1_t test_vqabs_s64(int64x1_t a) { - // CHECK-LABEL: test_vqabs_s64 return vqabs_s64(a); - // CHECK: sqabs d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vqneg_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> [[VQNEG_V_I]]) #4 +// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <1 x i64> [[VQNEG_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP1]] int64x1_t test_vqneg_s64(int64x1_t a) { - // CHECK-LABEL: test_vqneg_s64 return vqneg_s64(a); - // CHECK: sqneg d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vneg_s64(<1 x i64> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, %a +// CHECK: ret <1 x i64> [[SUB_I]] int64x1_t test_vneg_s64(int64x1_t a) { - // CHECK-LABEL: test_vneg_s64 return vneg_s64(a); - // CHECK: neg d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define float @test_vaddv_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VADDV_F32_I]] float32_t test_vaddv_f32(float32x2_t a) { - // CHECK-LABEL: test_vaddv_f32 return vaddv_f32(a); - // CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define float @test_vaddvq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> [[TMP1]]) #4 +// CHECK: ret float [[VADDVQ_F32_I]] float32_t test_vaddvq_f32(float32x4_t a) { - // CHECK-LABEL: test_vaddvq_f32 return vaddvq_f32(a); - // CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define double @test_vaddvq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: ret double [[VADDVQ_F64_I]] float64_t test_vaddvq_f64(float64x2_t a) { - // CHECK-LABEL: test_vaddvq_f64 return vaddvq_f64(a); - // CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d 
} +// CHECK-LABEL: define float @test_vmaxv_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VMAXV_F32_I]] float32_t test_vmaxv_f32(float32x2_t a) { - // CHECK-LABEL: test_vmaxv_f32 return vmaxv_f32(a); - // CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define double @test_vmaxvq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: ret double [[VMAXVQ_F64_I]] float64_t test_vmaxvq_f64(float64x2_t a) { - // CHECK-LABEL: test_vmaxvq_f64 return vmaxvq_f64(a); - // CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define float @test_vminv_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VMINV_F32_I]] float32_t test_vminv_f32(float32x2_t a) { - // CHECK-LABEL: test_vminv_f32 return vminv_f32(a); - // CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define double @test_vminvq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: ret double [[VMINVQ_F64_I]] float64_t test_vminvq_f64(float64x2_t a) { - // CHECK-LABEL: test_vminvq_f64 return vminvq_f64(a); - // CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define double @test_vmaxnmvq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: ret double [[VMAXNMVQ_F64_I]] float64_t test_vmaxnmvq_f64(float64x2_t a) { - // CHECK-LABEL: test_vmaxnmvq_f64 return vmaxnmvq_f64(a); - // CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define float @test_vmaxnmv_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VMAXNMV_F32_I]] float32_t test_vmaxnmv_f32(float32x2_t a) { - // CHECK-LABEL: test_vmaxnmv_f32 return vmaxnmv_f32(a); - // CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define double @test_vminnmvq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: ret double [[VMINNMVQ_F64_I]] float64_t test_vminnmvq_f64(float64x2_t a) { - // CHECK-LABEL: test_vminnmvq_f64 return vminnmvq_f64(a); - // CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define float @test_vminnmv_f32(<2 x float> %a) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VMINNMV_F32_I]] float32_t test_vminnmv_f32(float32x2_t a) { - // CHECK-LABEL: test_vminnmv_f32 return vminnmv_f32(a); - // CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vpaddq_s64 return vpaddq_s64(a, b); - // CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vpaddq_u64 return vpaddq_u64(a, b); - // CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define i64 @test_vpaddd_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4 +// CHECK: ret i64 [[VPADDD_U64_I]] uint64_t test_vpaddd_u64(uint64x2_t a) { - // CHECK-LABEL: test_vpaddd_u64 return vpaddd_u64(a); - // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define i64 @test_vaddvq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[TMP1]]) #4 +// CHECK: ret i64 [[VADDVQ_S64_I]] int64_t test_vaddvq_s64(int64x2_t a) { - // CHECK-LABEL: test_vaddvq_s64 return vaddvq_s64(a); - // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define i64 @test_vaddvq_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4 +// CHECK: ret i64 [[VADDVQ_U64_I]] uint64_t test_vaddvq_u64(uint64x2_t a) { - // CHECK-LABEL: test_vaddvq_u64 return vaddvq_u64(a); - // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 
x double> @test_vadd_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, %b +// CHECK: ret <1 x double> [[ADD_I]] float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vadd_f64 return vadd_f64(a, b); - // CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmul_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %a, %b +// CHECK: ret <1 x double> [[MUL_I]] float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vmul_f64 return vmul_f64(a, b); - // CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vdiv_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[DIV_I:%.*]] = fdiv <1 x double> %a, %b +// CHECK: ret <1 x double> [[DIV_I]] float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vdiv_f64 return vdiv_f64(a, b); - // CHECK: fdiv d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmla_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c +// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]] +// CHECK: ret <1 x double> [[ADD_I]] float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) { - // CHECK-LABEL: test_vmla_f64 return vmla_f64(a, b, c); - // CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmls_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c +// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]] +// CHECK: ret <1 x double> [[SUB_I]] float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) { - // CHECK-LABEL: test_vmls_f64 return vmls_f64(a, b, c); - // CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vfma_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[TMP6:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP4]], <1 x double> [[TMP5]], <1 x double> [[TMP3]]) #4 +// CHECK: ret <1 x double> [[TMP6]] float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) { - // CHECK-LABEL: test_vfma_f64 return vfma_f64(a, b, c); - // CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vfms_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[TMP4:%.*]] = fsub <1 x double> <double -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[FMLS1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[FMLS2_I:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLS_I]], <1 x double> [[TMP4]], <1 x double> [[FMLS1_I]]) #4 +// CHECK: ret <1 x double> [[FMLS2_I]] 
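+// NOTE: There is no separate fused multiply-subtract intrinsic at the IR
+// level; as the checks above show, vfms_f64 is lowered to an fsub from -0.0
+// (negating the second multiplicand) followed by @llvm.fma.v1f64, computing
+// a + (-b) * c. A rough C-level equivalent, given only as an illustrative
+// sketch (not part of the test; it reuses intrinsics already exercised in
+// this file):
+//   float64x1_t vfms_sketch(float64x1_t a, float64x1_t b, float64x1_t c) {
+//     return vfma_f64(a, vneg_f64(b), c); // fma(-b, c, a) == a - b*c
+//   }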
float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) { - // CHECK-LABEL: test_vfms_f64 return vfms_f64(a, b, c); - // CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, %b +// CHECK: ret <1 x double> [[SUB_I]] float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vsub_f64 return vsub_f64(a, b); - // CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> [[VABD_I]], <1 x double> [[VABD1_I]]) #4 +// CHECK: ret <1 x double> [[VABD2_I]] float64x1_t test_vabd_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vabd_f64 return vabd_f64(a, b); - // CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> [[VMAX_I]], <1 x double> [[VMAX1_I]]) #4 +// CHECK: ret <1 x double> [[VMAX2_I]] float64x1_t test_vmax_f64(float64x1_t a, float64x1_t b) { -// CHECK-LABEL: test_vmax_f64 return vmax_f64(a, b); -// CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> [[VMIN_I]], <1 x double> [[VMIN1_I]]) #4 +// CHECK: ret <1 x double> [[VMIN2_I]] float64x1_t test_vmin_f64(float64x1_t a, float64x1_t b) { -// CHECK-LABEL: test_vmin_f64 return vmin_f64(a, b); -// CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> [[VMAXNM_I]], <1 x double> [[VMAXNM1_I]]) #4 +// CHECK: ret <1 x double> [[VMAXNM2_I]] float64x1_t test_vmaxnm_f64(float64x1_t a, float64x1_t b) { -// CHECK-LABEL: test_vmaxnm_f64 return vmaxnm_f64(a, b); -// CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x 
double> %b to <8 x i8> +// CHECK: [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> [[VMINNM_I]], <1 x double> [[VMINNM1_I]]) #4 +// CHECK: ret <1 x double> [[VMINNM2_I]] float64x1_t test_vminnm_f64(float64x1_t a, float64x1_t b) { -// CHECK-LABEL: test_vminnm_f64 return vminnm_f64(a, b); -// CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vabs_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> [[VABS_I]]) #4 +// CHECK: ret <1 x double> [[VABS1_I]] float64x1_t test_vabs_f64(float64x1_t a) { - // CHECK-LABEL: test_vabs_f64 return vabs_f64(a); - // CHECK: fabs d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vneg_f64(<1 x double> %a) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %a +// CHECK: ret <1 x double> [[SUB_I]] float64x1_t test_vneg_f64(float64x1_t a) { - // CHECK-LABEL: test_vneg_f64 return vneg_f64(a); - // CHECK: fneg d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fptosi <1 x double> [[TMP1]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vcvt_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvt_s64_f64 return vcvt_s64_f64(a); - // CHECK: fcvtzs {{[xd][0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fptoui <1 x double> [[TMP1]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vcvt_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvt_u64_f64 return vcvt_u64_f64(a); - // CHECK: fcvtzu {{[xd][0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> [[VCVTN_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTN1_I]] int64x1_t test_vcvtn_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtn_s64_f64 return vcvtn_s64_f64(a); - // CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> [[VCVTN_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTN1_I]] uint64x1_t test_vcvtn_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtn_u64_f64 return vcvtn_u64_f64(a); - // CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x 
double> [[VCVTP_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTP1_I]] int64x1_t test_vcvtp_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtp_s64_f64 return vcvtp_s64_f64(a); - // CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> [[VCVTP_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTP1_I]] uint64x1_t test_vcvtp_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtp_u64_f64 return vcvtp_u64_f64(a); - // CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> [[VCVTM_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTM1_I]] int64x1_t test_vcvtm_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtm_s64_f64 return vcvtm_s64_f64(a); - // CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> [[VCVTM_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTM1_I]] uint64x1_t test_vcvtm_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtm_u64_f64 return vcvtm_u64_f64(a); - // CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> [[VCVTA_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTA1_I]] int64x1_t test_vcvta_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvta_s64_f64 return vcvta_s64_f64(a); - // CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> [[VCVTA_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTA1_I]] uint64x1_t test_vcvta_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvta_u64_f64 return vcvta_u64_f64(a); - // CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VCVT_I:%.*]] = sitofp <1 x i64> [[TMP1]] to <1 x double> +// CHECK: ret <1 x double> [[VCVT_I]] float64x1_t test_vcvt_f64_s64(int64x1_t a) { - // CHECK-LABEL: test_vcvt_f64_s64 return vcvt_f64_s64(a); - // CHECK: scvtf d{{[0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VCVT_I:%.*]] = uitofp <1 x i64> [[TMP1]] to <1 x double> +// CHECK: ret <1 x double> [[VCVT_I]] 
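+// NOTE: The vcvt_n_* tests below cover the fixed-point conversions. The
+// scale is passed to the @llvm.aarch64.neon.vcvtfp2fxs/vcvtfp2fxu and
+// vcvtfxs2fp/vcvtfxu2fp intrinsics as an i32 immediate (64 here, the maximum
+// for a 64-bit lane), so vcvt_n_s64_f64(x, n) truncates x * 2^n to a signed
+// integer. For example (illustrative only, not part of the test), a lane
+// holding 2.5 passed through vcvt_n_s64_f64 with n == 1 yields 5.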
float64x1_t test_vcvt_f64_u64(uint64x1_t a) { - // CHECK-LABEL: test_vcvt_f64_u64 return vcvt_f64_u64(a); - // CHECK: ucvtf d{{[0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64) +// CHECK: ret <1 x i64> [[VCVT_N1]] int64x1_t test_vcvt_n_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvt_n_s64_f64 return vcvt_n_s64_f64(a, 64); - // CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64 } +// CHECK-LABEL: define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64) +// CHECK: ret <1 x i64> [[VCVT_N1]] uint64x1_t test_vcvt_n_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvt_n_u64_f64 return vcvt_n_u64_f64(a, 64); - // CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64 } +// CHECK-LABEL: define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64) +// CHECK: ret <1 x double> [[VCVT_N1]] float64x1_t test_vcvt_n_f64_s64(int64x1_t a) { - // CHECK-LABEL: test_vcvt_n_f64_s64 return vcvt_n_f64_s64(a, 64); - // CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 } +// CHECK-LABEL: define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64) +// CHECK: ret <1 x double> [[VCVT_N1]] float64x1_t test_vcvt_n_f64_u64(uint64x1_t a) { - // CHECK-LABEL: test_vcvt_n_f64_u64 return vcvt_n_f64_u64(a, 64); - // CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 } +// CHECK-LABEL: define <1 x double> @test_vrndn_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDN1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> [[VRNDN_I]]) #4 +// CHECK: ret <1 x double> [[VRNDN1_I]] float64x1_t test_vrndn_f64(float64x1_t a) { - // CHECK-LABEL: test_vrndn_f64 return vrndn_f64(a); - // CHECK: frintn d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrnda_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[VRNDA_I]]) #4 +// CHECK: ret <1 x double> [[VRNDA1_I]] float64x1_t test_vrnda_f64(float64x1_t a) { - // CHECK-LABEL: test_vrnda_f64 return vrnda_f64(a); - // CHECK: frinta d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrndp_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> 
[[VRNDP_I]]) #4 +// CHECK: ret <1 x double> [[VRNDP1_I]] float64x1_t test_vrndp_f64(float64x1_t a) { - // CHECK-LABEL: test_vrndp_f64 return vrndp_f64(a); - // CHECK: frintp d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrndm_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[VRNDM_I]]) #4 +// CHECK: ret <1 x double> [[VRNDM1_I]] float64x1_t test_vrndm_f64(float64x1_t a) { - // CHECK-LABEL: test_vrndm_f64 return vrndm_f64(a); - // CHECK: frintm d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrndx_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[VRNDX_I]]) #4 +// CHECK: ret <1 x double> [[VRNDX1_I]] float64x1_t test_vrndx_f64(float64x1_t a) { - // CHECK-LABEL: test_vrndx_f64 return vrndx_f64(a); - // CHECK: frintx d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrnd_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[VRNDZ_I]]) #4 +// CHECK: ret <1 x double> [[VRNDZ1_I]] float64x1_t test_vrnd_f64(float64x1_t a) { - // CHECK-LABEL: test_vrnd_f64 return vrnd_f64(a); - // CHECK: frintz d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrndi_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDI_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[VRNDI_I]]) #4 +// CHECK: ret <1 x double> [[VRNDI1_I]] float64x1_t test_vrndi_f64(float64x1_t a) { - // CHECK-LABEL: test_vrndi_f64 return vrndi_f64(a); - // CHECK: frinti d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrsqrte_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> [[VRSQRTE_V_I]]) #4 +// CHECK: ret <1 x double> [[VRSQRTE_V1_I]] float64x1_t test_vrsqrte_f64(float64x1_t a) { - // CHECK-LABEL: test_vrsqrte_f64 return vrsqrte_f64(a); - // CHECK: frsqrte d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrecpe_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> [[VRECPE_V_I]]) #4 +// CHECK: ret <1 x double> [[VRECPE_V1_I]] float64x1_t test_vrecpe_f64(float64x1_t a) { - // CHECK-LABEL: test_vrecpe_f64 return vrecpe_f64(a); - // CHECK: frecpe d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vsqrt_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[TMP1]]) #4 +// CHECK: ret <1 x double> [[VSQRT_I]] float64x1_t 
test_vsqrt_f64(float64x1_t a) { - // CHECK-LABEL: test_vsqrt_f64 return vsqrt_f64(a); - // CHECK: fsqrt d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> [[VRECPS_V_I]], <1 x double> [[VRECPS_V1_I]]) #4 +// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <1 x double> [[VRECPS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <1 x double> +// CHECK: ret <1 x double> [[TMP2]] float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vrecps_f64 return vrecps_f64(a, b); - // CHECK: frecps d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> [[VRSQRTS_V_I]], <1 x double> [[VRSQRTS_V1_I]]) #4 +// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <1 x double> [[VRSQRTS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <1 x double> +// CHECK: ret <1 x double> [[TMP2]] float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vrsqrts_f64 return vrsqrts_f64(a, b); - // CHECK: frsqrts d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define i32 @test_vminv_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VMINV_S32_I]] int32_t test_vminv_s32(int32x2_t a) { - // CHECK-LABEL: test_vminv_s32 return vminv_s32(a); - // CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i32 @test_vminv_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VMINV_U32_I]] uint32_t test_vminv_u32(uint32x2_t a) { - // CHECK-LABEL: test_vminv_u32 return vminv_u32(a); - // CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i32 @test_vmaxv_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VMAXV_S32_I]] int32_t test_vmaxv_s32(int32x2_t a) { - // CHECK-LABEL: test_vmaxv_s32 return vmaxv_s32(a); - // CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i32 @test_vmaxv_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: 
[[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VMAXV_U32_I]] uint32_t test_vmaxv_u32(uint32x2_t a) { - // CHECK-LABEL: test_vmaxv_u32 return vmaxv_u32(a); - // CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i32 @test_vaddv_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VADDV_S32_I]] int32_t test_vaddv_s32(int32x2_t a) { - // CHECK-LABEL: test_vaddv_s32 return vaddv_s32(a); - // CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i32 @test_vaddv_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VADDV_U32_I]] uint32_t test_vaddv_u32(uint32x2_t a) { - // CHECK-LABEL: test_vaddv_u32 return vaddv_u32(a); - // CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i64 @test_vaddlv_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i64 [[VADDLV_S32_I]] int64_t test_vaddlv_s32(int32x2_t a) { - // CHECK-LABEL: test_vaddlv_s32 return vaddlv_s32(a); - // CHECK: saddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i64 @test_vaddlv_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i64 [[VADDLV_U32_I]] uint64_t test_vaddlv_u32(uint32x2_t a) { - // CHECK-LABEL: test_vaddlv_u32 return vaddlv_u32(a); - // CHECK: uaddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s } Index: test/CodeGen/aarch64-neon-ldst-one.c =================================================================== --- test/CodeGen/aarch64-neon-ldst-one.c +++ test/CodeGen/aarch64-neon-ldst-one.c @@ -1,2049 +1,7977 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -fallow-half-arguments-and-returns -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg | FileCheck %s #include <arm_neon.h> +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK: ret <16 x i8> [[LANE]] uint8x16_t test_vld1q_dup_u8(uint8_t *a) { - // CHECK-LABEL: test_vld1q_dup_u8 return vld1q_dup_u8(a); - // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> 
+// CHECK: ret <8 x i16> [[LANE]]
 uint16x8_t test_vld1q_dup_u16(uint16_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_u16
   return vld1q_dup_u16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i32> [[LANE]]
 uint32x4_t test_vld1q_dup_u32(uint32_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_u32
   return vld1q_dup_u32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x i64> [[LANE]]
 uint64x2_t test_vld1q_dup_u64(uint64_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_u64
   return vld1q_dup_u64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK: ret <16 x i8> [[LANE]]
 int8x16_t test_vld1q_dup_s8(int8_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_s8
   return vld1q_dup_s8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i16> [[LANE]]
 int16x8_t test_vld1q_dup_s16(int16_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_s16
   return vld1q_dup_s16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i32> [[LANE]]
 int32x4_t test_vld1q_dup_s32(int32_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_s32
   return vld1q_dup_s32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x i64> [[LANE]]
 int64x2_t test_vld1q_dup_s64(int64_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_s64
   return vld1q_dup_s64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
+// CHECK: ret <8 x half> [[TMP4]]
 float16x8_t test_vld1q_dup_f16(float16_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_f16
   return vld1q_dup_f16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x float> [[LANE]]
 float32x4_t test_vld1q_dup_f32(float32_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_f32
   return vld1q_dup_f32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x double> @test_vld1q_dup_f64(double* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK: [[TMP2:%.*]] = load double, double* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x double> [[LANE]]
 float64x2_t test_vld1q_dup_f64(float64_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_f64
   return vld1q_dup_f64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK: ret <16 x i8> [[LANE]]
 poly8x16_t test_vld1q_dup_p8(poly8_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_p8
   return vld1q_dup_p8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i16> [[LANE]]
 poly16x8_t test_vld1q_dup_p16(poly16_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_p16
   return vld1q_dup_p16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_p64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x i64> [[LANE]]
 poly64x2_t test_vld1q_dup_p64(poly64_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_p64
   return vld1q_dup_p64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i8> [[LANE]]
 uint8x8_t test_vld1_dup_u8(uint8_t *a) {
-  // CHECK-LABEL: test_vld1_dup_u8
   return vld1_dup_u8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i16> [[LANE]]
 uint16x4_t test_vld1_dup_u16(uint16_t *a) {
-  // CHECK-LABEL: test_vld1_dup_u16
   return vld1_dup_u16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x i32> [[LANE]]
 uint32x2_t test_vld1_dup_u32(uint32_t *a) {
-  // CHECK-LABEL: test_vld1_dup_u32
   return vld1_dup_u32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x i64> [[LANE]]
 uint64x1_t test_vld1_dup_u64(uint64_t *a) {
-  // CHECK-LABEL: test_vld1_dup_u64
   return vld1_dup_u64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i8> [[LANE]]
 int8x8_t test_vld1_dup_s8(int8_t *a) {
-  // CHECK-LABEL: test_vld1_dup_s8
   return vld1_dup_s8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i16> [[LANE]]
 int16x4_t test_vld1_dup_s16(int16_t *a) {
-  // CHECK-LABEL: test_vld1_dup_s16
   return vld1_dup_s16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x i32> [[LANE]]
 int32x2_t test_vld1_dup_s32(int32_t *a) {
-  // CHECK-LABEL: test_vld1_dup_s32
   return vld1_dup_s32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x i64> [[LANE]]
 int64x1_t test_vld1_dup_s64(int64_t *a) {
-  // CHECK-LABEL: test_vld1_dup_s64
   return vld1_dup_s64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP4]]
 float16x4_t test_vld1_dup_f16(float16_t *a) {
-  // CHECK-LABEL: test_vld1_dup_f16
   return vld1_dup_f16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x float> [[LANE]]
 float32x2_t test_vld1_dup_f32(float32_t *a) {
-  // CHECK-LABEL: test_vld1_dup_f32
   return vld1_dup_f32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <1 x double> @test_vld1_dup_f64(double* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK: [[TMP2:%.*]] = load double, double* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x double> [[LANE]]
 float64x1_t test_vld1_dup_f64(float64_t *a) {
-  // CHECK-LABEL: test_vld1_dup_f64
   return vld1_dup_f64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i8> [[LANE]]
 poly8x8_t test_vld1_dup_p8(poly8_t *a) {
-  // CHECK-LABEL: test_vld1_dup_p8
   return vld1_dup_p8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i16> [[LANE]]
 poly16x4_t test_vld1_dup_p16(poly16_t *a) {
-  // CHECK-LABEL: test_vld1_dup_p16
   return vld1_dup_p16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_p64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x i64> [[LANE]]
 poly64x1_t test_vld1_dup_p64(poly64_t *a) {
-  // CHECK-LABEL: test_vld1_dup_p64
   return vld1_dup_p64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_dup_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x2_t [[TMP4]]
 uint8x16x2_t test_vld2q_dup_u8(uint8_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_u8
   return vld2q_dup_u8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_dup_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld2q_dup_u16(uint16_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_u16
   return vld2q_dup_u16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_dup_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld2q_dup_u32(uint32_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_u32
   return vld2q_dup_u32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_dup_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld2q_dup_u64(uint64_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_u64
   return vld2q_dup_u64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x2_t [[TMP4]]
 int8x16x2_t test_vld2q_dup_s8(int8_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_s8
   return vld2q_dup_s8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld2q_dup_s16(int16_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_s16
   return vld2q_dup_s16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld2q_dup_s32(int32_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_s32
   return vld2q_dup_s32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld2q_dup_s64(int64_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_s64
   return vld2q_dup_s64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_dup_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld2q_dup_f16(float16_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_f16
   return vld2q_dup_f16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld2q_dup_f32(float32_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_f32
   return vld2q_dup_f32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld2q_dup_f64(float64_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_f64
   return vld2q_dup_f64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_dup_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x2_t [[TMP4]]
 poly8x16x2_t test_vld2q_dup_p8(poly8_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_p8
   return vld2q_dup_p8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_dup_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld2q_dup_p16(poly16_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_p16
   return vld2q_dup_p16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_dup_p64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld2q_dup_p64(poly64_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_p64
   return vld2q_dup_p64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_dup_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP4]]
 uint8x8x2_t test_vld2_dup_u8(uint8_t *a) {
-  // CHECK-LABEL: test_vld2_dup_u8
   return vld2_dup_u8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_dup_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_dup_u16(uint16_t *a) {
-  // CHECK-LABEL: test_vld2_dup_u16
   return vld2_dup_u16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_dup_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_dup_u32(uint32_t *a) {
-  // CHECK-LABEL: test_vld2_dup_u32
   return vld2_dup_u32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_dup_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_dup_u64(uint64_t *a) {
-  // CHECK-LABEL: test_vld2_dup_u64
   return vld2_dup_u64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP4]]
 int8x8x2_t test_vld2_dup_s8(int8_t *a) {
-  // CHECK-LABEL: test_vld2_dup_s8
   return vld2_dup_s8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_dup_s16(int16_t *a) {
-  // CHECK-LABEL: test_vld2_dup_s16
   return vld2_dup_s16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_dup_s32(int32_t *a) {
-  // CHECK-LABEL: test_vld2_dup_s32
   return vld2_dup_s32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_dup_s64(int64_t *a) {
-  // CHECK-LABEL: test_vld2_dup_s64
   return vld2_dup_s64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_dup_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_dup_f16(float16_t *a) {
-  // CHECK-LABEL: test_vld2_dup_f16
   return vld2_dup_f16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_dup_f32(float32_t *a) {
-  // CHECK-LABEL: test_vld2_dup_f32
   return vld2_dup_f32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld2_dup_f64(float64_t *a) {
-  // CHECK-LABEL: test_vld2_dup_f64
   return vld2_dup_f64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_dup_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP4]]
 poly8x8x2_t test_vld2_dup_p8(poly8_t *a) {
-  // CHECK-LABEL: test_vld2_dup_p8
   return vld2_dup_p8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_dup_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_dup_p16(poly16_t *a) {
-  // CHECK-LABEL: test_vld2_dup_p16
   return vld2_dup_p16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_dup_p64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld2_dup_p64(poly64_t *a) {
-  // CHECK-LABEL: test_vld2_dup_p64
   return vld2_dup_p64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_dup_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x3_t [[TMP4]]
 uint8x16x3_t test_vld3q_dup_u8(uint8_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_u8
   return vld3q_dup_u8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_dup_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld3q_dup_u16(uint16_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_u16
   return vld3q_dup_u16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_dup_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld3q_dup_u32(uint32_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_u32
   return vld3q_dup_u32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_dup_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_u64
   return vld3q_dup_u64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x3_t [[TMP4]]
 int8x16x3_t test_vld3q_dup_s8(int8_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_s8
   return vld3q_dup_s8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld3q_dup_s16(int16_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_s16
   return vld3q_dup_s16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld3q_dup_s32(int32_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_s32
   return vld3q_dup_s32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld3q_dup_s64(int64_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_s64
   return vld3q_dup_s64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_dup_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld3q_dup_f16(float16_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_f16
   return vld3q_dup_f16(a);
- // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x3_t [[TMP6]] float32x4x3_t test_vld3q_dup_f32(float32_t *a) { - // CHECK-LABEL: test_vld3q_dup_f32 return vld3q_dup_f32(a); - // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x3_t [[TMP6]] float64x2x3_t test_vld3q_dup_f64(float64_t *a) { - // CHECK-LABEL: test_vld3q_dup_f64 return vld3q_dup_f64(a); - // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], 
i8* [[TMP3]], i64 48, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x3_t [[TMP4]] poly8x16x3_t test_vld3q_dup_p8(poly8_t *a) { - // CHECK-LABEL: test_vld3q_dup_p8 return vld3q_dup_p8(a); - // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_dup_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x3_t [[TMP6]] poly16x8x3_t test_vld3q_dup_p16(poly16_t *a) { - // CHECK-LABEL: test_vld3q_dup_p16 return vld3q_dup_p16(a); - // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_dup_p64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x3_t [[TMP6]] poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) { - // CHECK-LABEL: test_vld3q_dup_p64 return vld3q_dup_p64(a); - // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_dup_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast 
%struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP4]] uint8x8x3_t test_vld3_dup_u8(uint8_t *a) { - // CHECK-LABEL: test_vld3_dup_u8 return vld3_dup_u8(a); - // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_dup_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP6]] uint16x4x3_t test_vld3_dup_u16(uint16_t *a) { - // CHECK-LABEL: test_vld3_dup_u16 return vld3_dup_u16(a); - // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_dup_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP6]] uint32x2x3_t test_vld3_dup_u32(uint32_t *a) { - // CHECK-LABEL: test_vld3_dup_u32 return vld3_dup_u32(a); - // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_dup_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } 
@llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x3_t [[TMP6]] uint64x1x3_t test_vld3_dup_u64(uint64_t *a) { - // CHECK-LABEL: test_vld3_dup_u64 return vld3_dup_u64(a); - // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x3_t [[TMP4]] int8x8x3_t test_vld3_dup_s8(int8_t *a) { - // CHECK-LABEL: test_vld3_dup_s8 return vld3_dup_s8(a); - // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x3_t [[TMP6]] int16x4x3_t test_vld3_dup_s16(int16_t *a) { - // CHECK-LABEL: test_vld3_dup_s16 return vld3_dup_s16(a); - // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// 
CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x3_t [[TMP6]] int32x2x3_t test_vld3_dup_s32(int32_t *a) { - // CHECK-LABEL: test_vld3_dup_s32 return vld3_dup_s32(a); - // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x3_t [[TMP6]] int64x1x3_t test_vld3_dup_s64(int64_t *a) { - // CHECK-LABEL: test_vld3_dup_s64 return vld3_dup_s64(a); - // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_dup_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x3_t [[TMP6]] float16x4x3_t test_vld3_dup_f16(float16_t *a) { - // CHECK-LABEL: test_vld3_dup_f16 return vld3_dup_f16(a); - // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x3_t 
@test_vld3_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP6]] float32x2x3_t test_vld3_dup_f32(float32_t *a) { - // CHECK-LABEL: test_vld3_dup_f32 return vld3_dup_f32(a); - // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x3_t [[TMP6]] float64x1x3_t test_vld3_dup_f64(float64_t *a) { - // CHECK-LABEL: test_vld3_dup_f64 return vld3_dup_f64(a); - // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret 
%struct.poly8x8x3_t [[TMP4]] poly8x8x3_t test_vld3_dup_p8(poly8_t *a) { - // CHECK-LABEL: test_vld3_dup_p8 return vld3_dup_p8(a); - // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_dup_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP6]] poly16x4x3_t test_vld3_dup_p16(poly16_t *a) { - // CHECK-LABEL: test_vld3_dup_p16 return vld3_dup_p16(a); - // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_dup_p64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x3_t [[TMP6]] poly64x1x3_t test_vld3_dup_p64(poly64_t *a) { - // CHECK-LABEL: test_vld3_dup_p64 return vld3_dup_p64(a); - // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_dup_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* 
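+// NOTE: the "i64 64" in the memcpy below is the size of the returned
+// aggregate: four 16-byte Q registers for the Q-form *x4_t structs. The
+// D-form vld4_dup tests further down copy "i64 32" (four 8-byte D
+// registers), and the vld3 variants above copy 48/24 bytes for the same
+// reason.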
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x4_t [[TMP4]] uint8x16x4_t test_vld4q_dup_u8(uint8_t *a) { - // CHECK-LABEL: test_vld4q_dup_u8 return vld4q_dup_u8(a); - // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_dup_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x4_t [[TMP6]] uint16x8x4_t test_vld4q_dup_u16(uint16_t *a) { - // CHECK-LABEL: test_vld4q_dup_u16 return vld4q_dup_u16(a); - // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_dup_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x4_t [[TMP6]] uint32x4x4_t test_vld4q_dup_u32(uint32_t *a) { - // CHECK-LABEL: test_vld4q_dup_u32 return vld4q_dup_u32(a); - // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_dup_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, 
<2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x4_t [[TMP6]] uint64x2x4_t test_vld4q_dup_u64(uint64_t *a) { - // CHECK-LABEL: test_vld4q_dup_u64 return vld4q_dup_u64(a); - // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x4_t [[TMP4]] int8x16x4_t test_vld4q_dup_s8(int8_t *a) { - // CHECK-LABEL: test_vld4q_dup_s8 return vld4q_dup_s8(a); - // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x4_t [[TMP6]] int16x8x4_t test_vld4q_dup_s16(int16_t *a) { - // CHECK-LABEL: test_vld4q_dup_s16 return vld4q_dup_s16(a); - // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = 
alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x4_t [[TMP6]] int32x4x4_t test_vld4q_dup_s32(int32_t *a) { - // CHECK-LABEL: test_vld4q_dup_s32 return vld4q_dup_s32(a); - // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x4_t [[TMP6]] int64x2x4_t test_vld4q_dup_s64(int64_t *a) { - // CHECK-LABEL: test_vld4q_dup_s64 return vld4q_dup_s64(a); - // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_dup_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// 
CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x4_t [[TMP6]] float16x8x4_t test_vld4q_dup_f16(float16_t *a) { - // CHECK-LABEL: test_vld4q_dup_f16 return vld4q_dup_f16(a); - // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x4_t [[TMP6]] float32x4x4_t test_vld4q_dup_f32(float32_t *a) { - // CHECK-LABEL: test_vld4q_dup_f32 return vld4q_dup_f32(a); - // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x4_t [[TMP6]] float64x2x4_t test_vld4q_dup_f64(float64_t *a) { - // CHECK-LABEL: test_vld4q_dup_f64 return vld4q_dup_f64(a); - // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a) +// 
CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x4_t [[TMP4]] poly8x16x4_t test_vld4q_dup_p8(poly8_t *a) { - // CHECK-LABEL: test_vld4q_dup_p8 return vld4q_dup_p8(a); - // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_dup_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x4_t [[TMP6]] poly16x8x4_t test_vld4q_dup_p16(poly16_t *a) { - // CHECK-LABEL: test_vld4q_dup_p16 return vld4q_dup_p16(a); - // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_dup_p64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x4_t [[TMP6]] poly64x2x4_t test_vld4q_dup_p64(poly64_t *a) { - // CHECK-LABEL: test_vld4q_dup_p64 return vld4q_dup_p64(a); - // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x4_t 
@test_vld4_dup_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x4_t [[TMP4]] uint8x8x4_t test_vld4_dup_u8(uint8_t *a) { - // CHECK-LABEL: test_vld4_dup_u8 return vld4_dup_u8(a); - // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_dup_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x4_t [[TMP6]] uint16x4x4_t test_vld4_dup_u16(uint16_t *a) { - // CHECK-LABEL: test_vld4_dup_u16 return vld4_dup_u16(a); - // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_dup_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 
8 +// CHECK: ret %struct.uint32x2x4_t [[TMP6]] uint32x2x4_t test_vld4_dup_u32(uint32_t *a) { - // CHECK-LABEL: test_vld4_dup_u32 return vld4_dup_u32(a); - // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_dup_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x4_t [[TMP6]] uint64x1x4_t test_vld4_dup_u64(uint64_t *a) { - // CHECK-LABEL: test_vld4_dup_u64 return vld4_dup_u64(a); - // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x4_t [[TMP4]] int8x8x4_t test_vld4_dup_s8(int8_t *a) { - // CHECK-LABEL: test_vld4_dup_s8 return vld4_dup_s8(a); - // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* 
[[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x4_t [[TMP6]] int16x4x4_t test_vld4_dup_s16(int16_t *a) { - // CHECK-LABEL: test_vld4_dup_s16 return vld4_dup_s16(a); - // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x4_t [[TMP6]] int32x2x4_t test_vld4_dup_s32(int32_t *a) { - // CHECK-LABEL: test_vld4_dup_s32 return vld4_dup_s32(a); - // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x4_t [[TMP6]] int64x1x4_t test_vld4_dup_s64(int64_t *a) { - // CHECK-LABEL: test_vld4_dup_s64 return vld4_dup_s64(a); - // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_dup_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] 
to i16* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x4_t [[TMP6]] float16x4x4_t test_vld4_dup_f16(float16_t *a) { - // CHECK-LABEL: test_vld4_dup_f16 return vld4_dup_f16(a); - // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x4_t [[TMP6]] float32x2x4_t test_vld4_dup_f32(float32_t *a) { - // CHECK-LABEL: test_vld4_dup_f32 return vld4_dup_f32(a); - // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x4_t [[TMP6]] float64x1x4_t 
test_vld4_dup_f64(float64_t *a) { - // CHECK-LABEL: test_vld4_dup_f64 return vld4_dup_f64(a); - // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x4_t [[TMP4]] poly8x8x4_t test_vld4_dup_p8(poly8_t *a) { - // CHECK-LABEL: test_vld4_dup_p8 return vld4_dup_p8(a); - // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_dup_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x4_t [[TMP6]] poly16x4x4_t test_vld4_dup_p16(poly16_t *a) { - // CHECK-LABEL: test_vld4_dup_p16 return vld4_dup_p16(a); - // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_dup_p64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = 
bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x4_t [[TMP6]] poly64x1x4_t test_vld4_dup_p64(poly64_t *a) { - // CHECK-LABEL: test_vld4_dup_p64 return vld4_dup_p64(a); - // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 +// CHECK: ret <16 x i8> [[VLD1_LANE]] uint8x16_t test_vld1q_lane_u8(uint8_t *a, uint8x16_t b) { - // CHECK-LABEL: test_vld1q_lane_u8 return vld1q_lane_u8(a, b, 15); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] uint16x8_t test_vld1q_lane_u16(uint16_t *a, uint16x8_t b) { - // CHECK-LABEL: test_vld1q_lane_u16 return vld1q_lane_u16(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3 +// CHECK: ret <4 x i32> [[VLD1_LANE]] uint32x4_t test_vld1q_lane_u32(uint32_t *a, uint32x4_t b) { - // CHECK-LABEL: test_vld1q_lane_u32 return vld1q_lane_u32(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 +// CHECK: ret <2 x i64> [[VLD1_LANE]] uint64x2_t test_vld1q_lane_u64(uint64_t *a, uint64x2_t b) { - // CHECK-LABEL: test_vld1q_lane_u64 return vld1q_lane_u64(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 +// CHECK: ret <16 x i8> [[VLD1_LANE]] int8x16_t test_vld1q_lane_s8(int8_t *a, int8x16_t b) { - // CHECK-LABEL: test_vld1q_lane_s8 return vld1q_lane_s8(a, b, 15); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> 
%b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] int16x8_t test_vld1q_lane_s16(int16_t *a, int16x8_t b) { - // CHECK-LABEL: test_vld1q_lane_s16 return vld1q_lane_s16(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3 +// CHECK: ret <4 x i32> [[VLD1_LANE]] int32x4_t test_vld1q_lane_s32(int32_t *a, int32x4_t b) { - // CHECK-LABEL: test_vld1q_lane_s32 return vld1q_lane_s32(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 +// CHECK: ret <2 x i64> [[VLD1_LANE]] int64x2_t test_vld1q_lane_s64(int64_t *a, int64x2_t b) { - // CHECK-LABEL: test_vld1q_lane_s64 return vld1q_lane_s64(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half> +// CHECK: ret <8 x half> [[TMP5]] float16x8_t test_vld1q_lane_f16(float16_t *a, float16x8_t b) { - // CHECK-LABEL: test_vld1q_lane_f16 return vld1q_lane_f16(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3 +// CHECK: ret <4 x float> [[VLD1_LANE]] float32x4_t test_vld1q_lane_f32(float32_t *a, float32x4_t b) { - // CHECK-LABEL: test_vld1q_lane_f32 return vld1q_lane_f32(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> 
[[TMP1]] to <2 x double> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double* +// CHECK: [[TMP4:%.*]] = load double, double* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1 +// CHECK: ret <2 x double> [[VLD1_LANE]] float64x2_t test_vld1q_lane_f64(float64_t *a, float64x2_t b) { - // CHECK-LABEL: test_vld1q_lane_f64 return vld1q_lane_f64(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 +// CHECK: ret <16 x i8> [[VLD1_LANE]] poly8x16_t test_vld1q_lane_p8(poly8_t *a, poly8x16_t b) { - // CHECK-LABEL: test_vld1q_lane_p8 return vld1q_lane_p8(a, b, 15); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] poly16x8_t test_vld1q_lane_p16(poly16_t *a, poly16x8_t b) { - // CHECK-LABEL: test_vld1q_lane_p16 return vld1q_lane_p16(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_p64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 +// CHECK: ret <2 x i64> [[VLD1_LANE]] poly64x2_t test_vld1q_lane_p64(poly64_t *a, poly64x2_t b) { - // CHECK-LABEL: test_vld1q_lane_p64 return vld1q_lane_p64(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 +// CHECK: ret <8 x i8> [[VLD1_LANE]] uint8x8_t test_vld1_lane_u8(uint8_t *a, uint8x8_t b) { - // CHECK-LABEL: test_vld1_lane_u8 return vld1_lane_u8(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: ret <4 x i16> [[VLD1_LANE]] uint16x4_t test_vld1_lane_u16(uint16_t *a, uint16x4_t b) { - // CHECK-LABEL: test_vld1_lane_u16 return vld1_lane_u16(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast 
<8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1 +// CHECK: ret <2 x i32> [[VLD1_LANE]] uint32x2_t test_vld1_lane_u32(uint32_t *a, uint32x2_t b) { - // CHECK-LABEL: test_vld1_lane_u32 return vld1_lane_u32(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0 +// CHECK: ret <1 x i64> [[VLD1_LANE]] uint64x1_t test_vld1_lane_u64(uint64_t *a, uint64x1_t b) { - // CHECK-LABEL: test_vld1_lane_u64 return vld1_lane_u64(a, b, 0); - // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 +// CHECK: ret <8 x i8> [[VLD1_LANE]] int8x8_t test_vld1_lane_s8(int8_t *a, int8x8_t b) { - // CHECK-LABEL: test_vld1_lane_s8 return vld1_lane_s8(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: ret <4 x i16> [[VLD1_LANE]] int16x4_t test_vld1_lane_s16(int16_t *a, int16x4_t b) { - // CHECK-LABEL: test_vld1_lane_s16 return vld1_lane_s16(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1 +// CHECK: ret <2 x i32> [[VLD1_LANE]] int32x2_t test_vld1_lane_s32(int32_t *a, int32x2_t b) { - // CHECK-LABEL: test_vld1_lane_s32 return vld1_lane_s32(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0 +// CHECK: ret <1 x i64> [[VLD1_LANE]] int64x1_t test_vld1_lane_s64(int64_t *a, int64x1_t b) { - // CHECK-LABEL: test_vld1_lane_s64 return vld1_lane_s64(a, b, 0); - // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}] } +// 
CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half> +// CHECK: ret <4 x half> [[TMP5]] float16x4_t test_vld1_lane_f16(float16_t *a, float16x4_t b) { - // CHECK-LABEL: test_vld1_lane_f16 return vld1_lane_f16(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1 +// CHECK: ret <2 x float> [[VLD1_LANE]] float32x2_t test_vld1_lane_f32(float32_t *a, float32x2_t b) { - // CHECK-LABEL: test_vld1_lane_f32 return vld1_lane_f32(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double* +// CHECK: [[TMP4:%.*]] = load double, double* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0 +// CHECK: ret <1 x double> [[VLD1_LANE]] float64x1_t test_vld1_lane_f64(float64_t *a, float64x1_t b) { - // CHECK-LABEL: test_vld1_lane_f64 return vld1_lane_f64(a, b, 0); - // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 +// CHECK: ret <8 x i8> [[VLD1_LANE]] poly8x8_t test_vld1_lane_p8(poly8_t *a, poly8x8_t b) { - // CHECK-LABEL: test_vld1_lane_p8 return vld1_lane_p8(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: ret <4 x i16> [[VLD1_LANE]] poly16x4_t test_vld1_lane_p16(poly16_t *a, poly16x4_t b) { - // CHECK-LABEL: test_vld1_lane_p16 return vld1_lane_p16(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <1 x i64> @test_vld1_lane_p64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* 
+// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0 +// CHECK: ret <1 x i64> [[VLD1_LANE]] poly64x1_t test_vld1_lane_p64(poly64_t *a, poly64x1_t b) { - // CHECK-LABEL: test_vld1_lane_p64 return vld1_lane_p64(a, b, 0); - // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[SRC]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] %src.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr) +// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]] +// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false) +// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) { - // CHECK-LABEL: test_vld2q_lane_s8 return vld2q_lane_s8(ptr, src, 15); - // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0] } +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[SRC]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] %src.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8*
[[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr) +// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]] +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false) +// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP8]] uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) { - // CHECK-LABEL: test_vld2q_lane_u8 return vld2q_lane_u8(ptr, src, 15); - // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0] } +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[SRC]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] %src.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr) +// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]] +// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8* +//
CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false) +// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP8]] poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) { - // CHECK-LABEL: test_vld2q_lane_p8 return vld2q_lane_p8(ptr, src, 15); - // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0] } +// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[SRC]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %src.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x3_t [[TMP9]] int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) { - // CHECK-LABEL: test_vld3q_lane_s8 return vld3q_lane_s8(ptr, src, 15); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [x0] } +// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK:
[[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[SRC]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %src.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x3_t [[TMP9]] uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) { - // CHECK-LABEL: test_vld3q_lane_u8 return vld3q_lane_u8(ptr, src, 15); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [x0] } +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds
%struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP13]] uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) { - // CHECK-LABEL: test_vld2q_lane_u16 return vld2q_lane_u16(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] %b.coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x
i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP13]] uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) { - // CHECK-LABEL: test_vld2q_lane_u32 return vld2q_lane_u32(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] %b.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x2_t [[TMP13]] uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) { - // CHECK-LABEL:
test_vld2q_lane_u64 return vld2q_lane_u64(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP13]] int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) { - // CHECK-LABEL: test_vld2q_lane_s16 return vld2q_lane_s16(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] %b.coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK:
call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP13]] int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) { - // CHECK-LABEL: test_vld2q_lane_s32 return vld2q_lane_s32(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] %b.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x
i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x2_t [[TMP13]] int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) { - // CHECK-LABEL: test_vld2q_lane_s64 return vld2q_lane_s64(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] %b.coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]],
i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x2_t [[TMP13]] float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) { - // CHECK-LABEL: test_vld2q_lane_f16 return vld2q_lane_f16(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x float>] %b.coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP13]] float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) { - // CHECK-LABEL: test_vld2q_lane_f32 return vld2q_lane_f32(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca
%struct.float64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x double>] %b.coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0i8(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x2_t [[TMP13]] float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) { - // CHECK-LABEL: test_vld2q_lane_f64 return vld2q_lane_f64(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +//
CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP13]] poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) { - // CHECK-LABEL: test_vld2q_lane_p16 return vld2q_lane_p16(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] %b.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> }
@llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x2_t [[TMP1]]3 poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) { - // CHECK-LABEL: test_vld2q_lane_p64 return vld2q_lane_p64(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a) +// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]] +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false) +// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP8]] uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) { - // CHECK-LABEL: test_vld2_lane_u8 return vld2_lane_u8(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca 
%struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP1]]3 uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) { - // CHECK-LABEL: test_vld2_lane_u16 return vld2_lane_u16(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// 
CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP1]]3 uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) { - // CHECK-LABEL: test_vld2_lane_u32 return vld2_lane_u32(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* 
[[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x2_t [[TMP13]]
 uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u64
   return vld2_lane_u64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s8
   return vld2_lane_s8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
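+// Illustrative usage sketch (hypothetical helper, assuming only <arm_neon.h>):
+// vld2_lane_s16 replaces lane 3 of each <4 x i16> half of `pair` with the two
+// int16_t values interleaved at `src`. Lanes 0-3 are valid for 64-bit vectors
+// of 16-bit elements, and the lane index must be a compile-time constant.
+static inline int16x4x2_t example_reload_s16_lane3(const int16_t *src,
+                                                   int16x4x2_t pair) {
+  return vld2_lane_s16(src, pair, 3);
+}
+// CHECK: store [2 x <4 x i16>]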
[[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP1]]3 int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) { - // CHECK-LABEL: test_vld2_lane_s16 return vld2_lane_s16(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, 
%struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP1]]3 int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) { - // CHECK-LABEL: test_vld2_lane_s32 return vld2_lane_s32(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = 
bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x2_t [[TMP1]]3 int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) { - // CHECK-LABEL: test_vld2_lane_s64 return vld2_lane_s64(a, b, 0); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x2_t [[TMP1]]3 float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) { - // CHECK-LABEL: test_vld2_lane_f16 return vld2_lane_f16(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: 
[[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP13]]
 float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f32
   return vld2_lane_f32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
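+// Illustrative usage sketch (hypothetical helper, assuming only <arm_neon.h>):
+// for 64-bit elements in a 64-bit vector there is exactly one lane, so 0 is
+// the only valid constant lane index for vld2_lane_f64.
+static inline float64x1x2_t example_reload_f64_lane0(const float64_t *src,
+                                                     float64x1x2_t pair) {
+  return vld2_lane_f64(src, pair, 0);
+}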
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double> +// CHECK: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0i8(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x2_t [[TMP1]]3 float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) { - // CHECK-LABEL: test_vld2_lane_f64 return vld2_lane_f64(a, b, 0); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a) +// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]] +// CHECK: [[TMP6:%.*]] = bitcast 
%struct.poly8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false) +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) { - // CHECK-LABEL: test_vld2_lane_p8 return vld2_lane_p8(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP1]]3 poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) { - // CHECK-LABEL: test_vld2_lane_p16 return vld2_lane_p16(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 
8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x2_t [[TMP1]]3 poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) { - // CHECK-LABEL: test_vld2_lane_p64 return vld2_lane_p64(a, b, 0); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x 
<8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x3_t [[TMP16]]
 uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u16
   return vld3q_lane_u16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
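+// Illustrative usage sketch (hypothetical helper, assuming only <arm_neon.h>):
+// the three-register form works the same way, with vld3q_lane_u32 refreshing
+// lane 3 of all three <4 x i32> registers of `triple` from three consecutive
+// uint32_t values at `src`; lanes 0-3 are valid here.
+static inline uint32x4x3_t example_reload_u32_lane3(const uint32_t *src,
+                                                    uint32x4x3_t triple) {
+  return vld3q_lane_u32(src, triple, 3);
+}
+//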
CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x3_t [[TMP1]]6 uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) { - // CHECK-LABEL: test_vld3q_lane_u32 return vld3q_lane_u32(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds 
%struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP1]]0, <2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x3_t [[TMP1]]6 uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) { - // CHECK-LABEL: test_vld3q_lane_u64 return vld3q_lane_u64(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x 
i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x3_t [[TMP1]]6 int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) { - // CHECK-LABEL: test_vld3q_lane_s16 return vld3q_lane_s16(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> 
[[TMP1]]0, <4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x3_t [[TMP1]]6 int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) { - // CHECK-LABEL: test_vld3q_lane_s32 return vld3q_lane_s32(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP1]]0, <2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast 
%struct.int64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x3_t [[TMP1]]6 int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) { - // CHECK-LABEL: test_vld3q_lane_s64 return vld3q_lane_s64(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load 
%struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x3_t [[TMP16]] float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) { - // CHECK-LABEL: test_vld3q_lane_f16 return vld3q_lane_f16(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x3_t [[TMP16]] float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) { - // CHECK-LABEL: test_vld3q_lane_f32 return vld3q_lane_f32(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] }
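Each of the q-register tests above exercises the same contract: vld3q_lane_TYPE takes a pointer to scalar data, a three-vector aggregate, and a constant lane index, and reloads only that lane of each of the three vectors. A minimal usage sketch under that contract (illustrative only; the function and parameter names here are invented and are not part of the test):

  #include <arm_neon.h>
  // Refresh lane 3 of three deinterleaved float32x4 vectors from p[0..2].
  // The lane index must be a compile-time constant in [0, 3] for 4-lane vectors.
  static float32x4x3_t refresh_lane3(const float32_t *p, float32x4x3_t acc) {
    return vld3q_lane_f32(p, acc, 3);
  }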
+// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0i8(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x3_t [[TMP16]] float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) { - // CHECK-LABEL: test_vld3q_lane_f64 return vld3q_lane_f64(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] }
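A note on the FileCheck idiom these checks rely on: [[NAME:%.*]] binds NAME to whatever IR value the regex matches at the point of definition, and a later bare [[NAME]] must spell the full variable name ([[TMP13]], not [[TMP1]] followed by a literal 3). A tiny self-contained illustration (hypothetical IR, not taken from this test):

  // CHECK: [[SUM:%.*]] = add i32 %x, %y   ; binds SUM to, e.g., "%0"
  // CHECK: ret i32 [[SUM]]                ; then matches "ret i32 %0"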
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x3_t [[TMP9]] poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) { - // CHECK-LABEL: test_vld3q_lane_p8 return vld3q_lane_p8(a, b, 15); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64
48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x3_t [[TMP16]] poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) { - // CHECK-LABEL: test_vld3q_lane_p16 return vld3q_lane_p16(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] =
getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x3_t [[TMP16]] poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) { - // CHECK-LABEL: test_vld3q_lane_p64 return vld3q_lane_p64(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] }
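From here on the tests switch from the 128-bit q forms to the 64-bit forms: the aggregates become 8-byte-aligned x8x3_t/x4x3_t/x2x3_t/x1x3_t structs and the struct copies shrink from 48 to 24 bytes, but the lane-load contract is unchanged. A sketch for the narrow form (illustrative only; names invented):

  #include <arm_neon.h>
  // Lane 7 is the last valid lane of an 8-lane <8 x i8> vector.
  static uint8x8x3_t refresh_u8(const uint8_t *p, uint8x8x3_t acc) {
    return vld3_lane_u8(p, acc, 7);
  }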
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP9]] uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) { - // CHECK-LABEL: test_vld3_lane_u8 return vld3_lane_u8(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x
i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP16]] uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) { - // CHECK-LABEL: test_vld3_lane_u16 return vld3_lane_u16(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x
i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP16]] uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) { - // CHECK-LABEL: test_vld3_lane_u32 return vld3_lane_u32(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x3_t [[TMP16]] uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) { - // CHECK-LABEL: test_vld3_lane_u64 return vld3_lane_u64(a, b, 0); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] }
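For the 64-bit element types the 64-bit vectors have exactly one lane, so 0 is the only lane index these intrinsics accept, and the IR accordingly passes i64 0 to @llvm.aarch64.neon.ld3lane.v1i64. Sketch (illustrative only; names invented):

  #include <arm_neon.h>
  // <1 x i64> has a single lane; any index other than 0 is rejected at compile time.
  static uint64x1x3_t refresh_u64(const uint64_t *p, uint64x1x3_t acc) {
    return vld3_lane_u64(p, acc, 0);
  }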
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x3_t [[TMP9]] int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) { - // CHECK-LABEL: test_vld3_lane_s8 return vld3_lane_s8(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x
i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x3_t [[TMP16]] int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) { - // CHECK-LABEL: test_vld3_lane_s16 return vld3_lane_s16(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast
%struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x3_t [[TMP16]] int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) { - // CHECK-LABEL: test_vld3_lane_s32 return vld3_lane_s32(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x
i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x3_t [[TMP16]] int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) { - // CHECK-LABEL: test_vld3_lane_s64 return vld3_lane_s64(a, b, 0); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]*
[[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x3_t [[TMP16]] float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) { - // CHECK-LABEL: test_vld3_lane_f16 return vld3_lane_f16(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t*
[[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP16]] float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) { - // CHECK-LABEL: test_vld3_lane_f32 return vld3_lane_f32(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x double>, <1 x double>*
[[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double> +// CHECK: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0i8(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x3_t [[TMP16]] float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) { - // CHECK-LABEL: test_vld3_lane_f64 return vld3_lane_f64(a, b, 0); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +//
CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x3_t [[TMP9]] poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) { - // CHECK-LABEL: test_vld3_lane_p8 return vld3_lane_p8(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP16]] poly16x4x3_t
test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) { - // CHECK-LABEL: test_vld3_lane_p16 return vld3_lane_p16(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x3_t [[TMP16]] poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) { - // CHECK-LABEL: test_vld3_lane_p64 return vld3_lane_p64(a, b, 0); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] }
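The remaining tests repeat the whole pattern for the four-register loads: [4 x <16 x i8>] coercions, 64-byte struct copies, and calls to the @llvm.aarch64.neon.ld4lane intrinsics. A sketch of the corresponding source-level call (illustrative only; names invented):

  #include <arm_neon.h>
  // Four vectors at once; lane 15 is the last byte lane of a <16 x i8> vector.
  static uint8x16x4_t refresh_u8q4(const uint8_t *p, uint8x16x4_t acc) {
    return vld4q_lane_u8(p, acc, 15);
  }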
[[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16 +// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x4_t [[TMP10]] uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) { - // CHECK-LABEL: test_vld4q_lane_u8 return vld4q_lane_u8(a, b, 15); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x
<8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x4_t [[TMP19]] uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) { - // CHECK-LABEL: test_vld4q_lane_u16 return vld4q_lane_u16(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t,
align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] %b.coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x4_t [[TMP19]] uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) { - // CHECK-LABEL: test_vld4q_lane_u32 return vld4q_lane_u32(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_lane_u64(i64*
%a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x4_t [[TMP19]] uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) { - // CHECK-LABEL:
test_vld4q_lane_u64 return vld4q_lane_u64(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16 +// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x4_t [[TMP10]] int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) { - // CHECK-LABEL: test_vld4q_lane_s8 return vld4q_lane_s8(a, b, 15); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca
%struct.int16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x4_t [[TMP19]] int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) { - // CHECK-LABEL: test_vld4q_lane_s16 return vld4q_lane_s16(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a,
[4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] %b.coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x4_t [[TMP19]] int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) { - // CHECK-LABEL: test_vld4q_lane_s32 return
vld4q_lane_s32(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int64x2x4_t,
%struct.int64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x4_t [[TMP19]] int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) { - // CHECK-LABEL: test_vld4q_lane_s64 return vld4q_lane_s64(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] %b.coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast
%struct.float16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x4_t [[TMP19]] float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) { - // CHECK-LABEL: test_vld4q_lane_f16 return vld4q_lane_f16(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x float>] %b.coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]],
<4 x float> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x4_t [[TMP19]] float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) { - // CHECK-LABEL: test_vld4q_lane_f32 return vld4q_lane_f32(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x double>] %b.coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK: [[TMP13:%.*]] = bitcast <16 x
i8> [[TMP7]] to <2 x double> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0i8(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x4_t [[TMP19]] float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) { - // CHECK-LABEL: test_vld4q_lane_f64 return vld4q_lane_f64(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16 +// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8>
[[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x4_t [[TMP10]] poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) { - // CHECK-LABEL: test_vld4q_lane_p8 return vld4q_lane_p8(a, b, 15); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x4_t [[TMP19]] poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) { - // CHECK-LABEL: test_vld4q_lane_p16 return vld4q_lane_p16(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i64>
[[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x4_t [[TMP19]] poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) { - // CHECK-LABEL: test_vld4q_lane_p64 return vld4q_lane_p64(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8 +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8>
[[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x4_t [[TMP10]] uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) { - // CHECK-LABEL: test_vld4_lane_u8 return vld4_lane_u8(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8>
[[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x4_t [[TMP19]] uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) { - // CHECK-LABEL: test_vld4_lane_u16 return vld4_lane_u16(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] %b.coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8>
[[TMP5]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <2 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, <2 x i32> [[TMP1]]5, i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x4_t [[TMP1]]9 uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) { - // CHECK-LABEL: test_vld4_lane_u32 return vld4_lane_u32(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, 
i64 3 +// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <1 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, <1 x i64> [[TMP1]]5, i64 0, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x4_t [[TMP1]]9 uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) { - // CHECK-LABEL: test_vld4_lane_u64 return vld4_lane_u64(a, b, 0); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: 
[[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x4_t [[TMP1]]0 int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) { - // CHECK-LABEL: test_vld4_lane_s8 return vld4_lane_s8(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = 
bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, <4 x i16> [[TMP1]]5, i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x4_t [[TMP1]]9 int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) { - // CHECK-LABEL: test_vld4_lane_s16 return vld4_lane_s16(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP1]]0 to <8 
x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <2 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, <2 x i32> [[TMP1]]5, i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x4_t [[TMP1]]9 int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) { - // CHECK-LABEL: test_vld4_lane_s32 return vld4_lane_s32(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x 
i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <1 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, <1 x i64> [[TMP1]]5, i64 0, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x4_t [[TMP1]]9 int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) { - // CHECK-LABEL: test_vld4_lane_s64 return vld4_lane_s64(a, b, 0); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x 
half> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, <4 x i16> [[TMP1]]5, i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x4_t [[TMP1]]9 float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) { - // CHECK-LABEL: test_vld4_lane_f16 return vld4_lane_f16(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, 
%struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <2 x float> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP1]]2, <2 x float> [[TMP1]]3, <2 x float> [[TMP1]]4, <2 x float> [[TMP1]]5, i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x4_t [[TMP1]]9 float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) { - // CHECK-LABEL: test_vld4_lane_f32 return vld4_lane_f32(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// 
CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <1 x double> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <1 x double> +// CHECK: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0i8(<1 x double> [[TMP1]]2, <1 x double> [[TMP1]]3, <1 x double> [[TMP1]]4, <1 x double> [[TMP1]]5, i64 0, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x4_t [[TMP1]]9 float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) { - // CHECK-LABEL: test_vld4_lane_f64 return vld4_lane_f64(a, b, 0); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// 
CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x4_t [[TMP1]]0 poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) { - // CHECK-LABEL: test_vld4_lane_p8 return vld4_lane_p8(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, 
<4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, <4 x i16> [[TMP1]]5, i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x4_t [[TMP1]]9 poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) { - // CHECK-LABEL: test_vld4_lane_p16 return vld4_lane_p16(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, 
%struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <1 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, <1 x i64> [[TMP1]]5, i64 0, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x4_t [[TMP1]]9 poly64x1x4_t test_vld4_lane_p64(poly64_t *a, poly64x1x4_t b) { - // CHECK-LABEL: test_vld4_lane_p64 return vld4_lane_p64(a, b, 0); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1q_lane_u8(uint8_t *a, uint8x16_t b) { - // CHECK-LABEL: test_vst1q_lane_u8 vst1q_lane_u8(a, b, 15); - // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_u16(uint16_t *a, uint16x8_t b) { - // CHECK-LABEL: test_vst1q_lane_u16 vst1q_lane_u16(a, b, 7); - // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> 
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_u32(uint32_t *a, uint32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u32
   vst1q_lane_u32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_u64(uint64_t *a, uint64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u64
   vst1q_lane_u64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK: store i8 [[TMP0]], i8* %a
+// CHECK: ret void
 void test_vst1q_lane_s8(int8_t *a, int8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s8
   vst1q_lane_s8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_s16(int16_t *a, int16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s16
   vst1q_lane_s16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_s32(int32_t *a, int32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s32
   vst1q_lane_s32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_s64(int64_t *a, int64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s64
   vst1q_lane_s64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_f16(float16_t *a, float16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f16
   vst1q_lane_f16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: store float [[TMP3]], float* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_f32(float32_t *a, float32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f32
   vst1q_lane_f32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK: store double [[TMP3]], double* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_f64(float64_t *a, float64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f64
   vst1q_lane_f64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK: store i8 [[TMP0]], i8* %a
+// CHECK: ret void
 void test_vst1q_lane_p8(poly8_t *a, poly8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p8
   vst1q_lane_p8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_p16(poly16_t *a, poly16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p16
   vst1q_lane_p16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_p64(poly64_t *a, poly64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p64
   vst1q_lane_p64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK: store i8 [[TMP0]], i8* %a
+// CHECK: ret void
 void test_vst1_lane_u8(uint8_t *a, uint8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_u8
   vst1_lane_u8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_u16(uint16_t *a, uint16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_u16
   vst1_lane_u16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_u32(uint32_t *a, uint32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_u32
   vst1_lane_u32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_u64(uint64_t *a, uint64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_u64
   vst1_lane_u64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK: store i8 [[TMP0]], i8* %a
+// CHECK: ret void
 void test_vst1_lane_s8(int8_t *a, int8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_s8
   vst1_lane_s8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_s16(int16_t *a, int16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_s16
   vst1_lane_s16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_s32(int32_t *a, int32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_s32
   vst1_lane_s32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_s64(int64_t *a, int64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_s64
   vst1_lane_s64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_f16(float16_t *a, float16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_f16
   vst1_lane_f16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: store float [[TMP3]], float* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_f32(float32_t *a, float32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_f32
   vst1_lane_f32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_f64(double* %a, <1 x double> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK: [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK: store double [[TMP3]], double* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_f64(float64_t *a, float64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_f64
   vst1_lane_f64(a, b, 0);
-  // CHECK: {{st1 { v[0-9]+.d }\[0]|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK: store i8 [[TMP0]], i8* %a
+// CHECK: ret void
 void test_vst1_lane_p8(poly8_t *a, poly8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_p8
   vst1_lane_p8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_p16(poly16_t *a, poly16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_p16
   vst1_lane_p16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_p64(i64* %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_p64(poly64_t *a, poly64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_p64
   vst1_lane_p64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst2q_lane_u8(uint8_t *a, uint8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u8
   vst2q_lane_u8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u16
   vst2q_lane_u16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x i32>] %b.coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u32
   vst2q_lane_u32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <2 x i64>] %b.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u64
   vst2q_lane_u64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst2q_lane_s8(int8_t *a, int8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s8
   vst2q_lane_s8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x
i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_s16(int16_t *a, int16x8x2_t b) { - // CHECK-LABEL: test_vst2q_lane_s16 vst2q_lane_s16(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_s32(int32_t *a, int32x4x2_t b) { - // CHECK-LABEL: test_vst2q_lane_s32 vst2q_lane_s32(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// 
CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_s64(int64_t *a, int64x2x2_t b) { - // CHECK-LABEL: test_vst2q_lane_s64 vst2q_lane_s64(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_f16(float16_t *a, float16x8x2_t b) { - // CHECK-LABEL: test_vst2q_lane_f16 vst2q_lane_f16(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds 
%struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_f32(float32_t *a, float32x4x2_t b) { - // CHECK-LABEL: test_vst2q_lane_f32 vst2q_lane_f32(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_f64(float64_t *a, float64x2x2_t b) { - // CHECK-LABEL: test_vst2q_lane_f64 vst2q_lane_f64(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x 
<16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a) +// CHECK: ret void void test_vst2q_lane_p8(poly8_t *a, poly8x16x2_t b) { - // CHECK-LABEL: test_vst2q_lane_p8 vst2q_lane_p8(a, b, 15); - // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_p16(poly16_t *a, poly16x8x2_t b) { - // CHECK-LABEL: test_vst2q_lane_p16 vst2q_lane_p16(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// 
CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_p64(poly64_t *a, poly64x2x2_t b) { - // CHECK-LABEL: test_vst2q_lane_p64 vst2q_lane_p64(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_u8(uint8_t *a, uint8x8x2_t b) { - // CHECK-LABEL: test_vst2_lane_u8 vst2_lane_u8(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: 
[[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_u16(uint16_t *a, uint16x4x2_t b) { - // CHECK-LABEL: test_vst2_lane_u16 vst2_lane_u16(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_u32(uint32_t *a, uint32x2x2_t b) { - // CHECK-LABEL: test_vst2_lane_u32 vst2_lane_u32(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = 
getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_u64(uint64_t *a, uint64x1x2_t b) { - // CHECK-LABEL: test_vst2_lane_u64 vst2_lane_u64(a, b, 0); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_s8(int8_t *a, int8x8x2_t b) { - // CHECK-LABEL: test_vst2_lane_s8 vst2_lane_s8(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x 
i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_s16(int16_t *a, int16x4x2_t b) { - // CHECK-LABEL: test_vst2_lane_s16 vst2_lane_s16(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_s32(int32_t *a, int32x2x2_t b) { - // CHECK-LABEL: test_vst2_lane_s32 vst2_lane_s32(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca 
%struct.int64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_s64(int64_t *a, int64x1x2_t b) { - // CHECK-LABEL: test_vst2_lane_s64 vst2_lane_s64(a, b, 0); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_f16(float16_t *a, 
float16x4x2_t b) { - // CHECK-LABEL: test_vst2_lane_f16 vst2_lane_f16(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_f32(float32_t *a, float32x2x2_t b) { - // CHECK-LABEL: test_vst2_lane_f32 vst2_lane_f32(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]2, align 8 +// CHECK: 
[[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: call void @llvm.aarch64.neon.st2lane.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_f64(float64_t *a, float64x1x2_t b) { - // CHECK-LABEL: test_vst2_lane_f64 vst2_lane_f64(a, b, 0); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_p8(poly8_t *a, poly8x8x2_t b) { - // CHECK-LABEL: test_vst2_lane_p8 vst2_lane_p8(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = 
bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_p16(poly16_t *a, poly16x4x2_t b) { - // CHECK-LABEL: test_vst2_lane_p16 vst2_lane_p16(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_p64(poly64_t *a, poly64x1x2_t b) { - // CHECK-LABEL: test_vst2_lane_p64 vst2_lane_p64(a, b, 0); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3q_lane_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = 
load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a) +// CHECK: ret void void test_vst3q_lane_u8(uint8_t *a, uint8x16x3_t b) { - // CHECK-LABEL: test_vst3q_lane_u8 vst3q_lane_u8(a, b, 15); - // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_u16(uint16_t *a, uint16x8x3_t b) { - // CHECK-LABEL: test_vst3q_lane_u16 vst3q_lane_u16(a, b, 7); - // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast 
%struct.uint32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_u32(uint32_t *a, uint32x4x3_t b) { - // CHECK-LABEL: test_vst3q_lane_u32 vst3q_lane_u32(a, b, 3); - // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x 
<2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u64
   vst3q_lane_u64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst3q_lane_s8(int8_t *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s8
   vst3q_lane_s8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_s16(int16_t *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s16
   vst3q_lane_s16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_s32(int32_t *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s32
   vst3q_lane_s32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_s64(int64_t *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s64
   vst3q_lane_s64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x half>] %b.coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_f16(float16_t *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f16
   vst3q_lane_f16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x float>] %b.coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_f32(float32_t *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f32
   vst3q_lane_f32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x double>] %b.coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_f64(float64_t *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f64
   vst3q_lane_f64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p8
   vst3q_lane_p8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p16
   vst3q_lane_p16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p64
   vst3q_lane_p64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: ret void
 void test_vst3_lane_u8(uint8_t *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u8
   vst3_lane_u8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_u16(uint16_t *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u16
   vst3_lane_u16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_u32(uint32_t *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u32
   vst3_lane_u32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_u64(uint64_t *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u64
   vst3_lane_u64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: ret void
 void test_vst3_lane_s8(int8_t *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s8
   vst3_lane_s8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_s16(int16_t *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s16
   vst3_lane_s16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_s32(int32_t *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s32
   vst3_lane_s32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_s64(int64_t *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s64
   vst3_lane_s64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x half>] %b.coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_f16(float16_t *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f16
   vst3_lane_f16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x float>] %b.coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_f32(float32_t *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f32
   vst3_lane_f32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x double>] %b.coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_f64(float64_t *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f64
   vst3_lane_f64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: ret void
 void test_vst3_lane_p8(poly8_t *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p8
   vst3_lane_p8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_p16(poly16_t *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p16
   vst3_lane_p16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_p64(poly64_t *a, poly64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p64
   vst3_lane_p64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u8
   vst4q_lane_u8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u16
   vst4q_lane_u16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i32>] %b.coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u32
   vst4q_lane_u32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u64
   vst4q_lane_u64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst4q_lane_s8(int8_t *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s8
   vst4q_lane_s8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x
i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_s16(int16_t *a, int16x8x4_t b) { - // CHECK-LABEL: test_vst4q_lane_s16 vst4q_lane_s16(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_s32(int32_t *a, int32x4x4_t b) { - // CHECK-LABEL: test_vst4q_lane_s32 vst4q_lane_s32(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void 
@test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, <2 x i64> [[TMP1]]3, <2 x i64> [[TMP1]]4, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_s64(int64_t *a, int64x2x4_t b) { - // CHECK-LABEL: test_vst4q_lane_s64 vst4q_lane_s64(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: 
[[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) { - // CHECK-LABEL: test_vst4q_lane_f16 vst4q_lane_f16(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x 
float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4f32.p0i8(<4 x float> [[TMP1]]1, <4 x float> [[TMP1]]2, <4 x float> [[TMP1]]3, <4 x float> [[TMP1]]4, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_f32(float32_t *a, float32x4x4_t b) { - // CHECK-LABEL: test_vst4q_lane_f32 vst4q_lane_f32(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, 
i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x double> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2f64.p0i8(<2 x double> [[TMP1]]1, <2 x double> [[TMP1]]2, <2 x double> [[TMP1]]3, <2 x double> [[TMP1]]4, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_f64(float64_t *a, float64x2x4_t b) { - // CHECK-LABEL: test_vst4q_lane_f64 vst4q_lane_f64(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]6, align 16 +// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a) +// CHECK: ret void void test_vst4q_lane_p8(poly8_t *a, poly8x16x4_t b) { - // CHECK-LABEL: test_vst4q_lane_p8 vst4q_lane_p8(a, b, 15); - // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], 
i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_p16(poly16_t *a, poly16x8x4_t b) { - // CHECK-LABEL: test_vst4q_lane_p16 vst4q_lane_p16(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, <2 x i64> [[TMP1]]3, <2 x i64> [[TMP1]]4, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_p64(poly64_t *a, poly64x2x4_t b) { - // CHECK-LABEL: test_vst4q_lane_p64 vst4q_lane_p64(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x 
<8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: ret void void test_vst4_lane_u8(uint8_t *a, uint8x8x4_t b) { - // CHECK-LABEL: test_vst4_lane_u8 vst4_lane_u8(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_u16(uint16_t *a, uint16x4x4_t b) { - // CHECK-LABEL: test_vst4_lane_u16 vst4_lane_u16(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// 
CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP1]]1, <2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_u32(uint32_t *a, uint32x2x4_t b) { - // CHECK-LABEL: test_vst4_lane_u32 vst4_lane_u32(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, 
i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_u64(uint64_t *a, uint64x1x4_t b) { - // CHECK-LABEL: test_vst4_lane_u64 vst4_lane_u64(a, b, 0); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// 
CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: ret void void test_vst4_lane_s8(int8_t *a, int8x8x4_t b) { - // CHECK-LABEL: test_vst4_lane_s8 vst4_lane_s8(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_s16(int16_t *a, int16x4x4_t b) { - // CHECK-LABEL: test_vst4_lane_s16 vst4_lane_s16(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca 
%struct.int32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP1]]1, <2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_s32(int32_t *a, int32x2x4_t b) { - // CHECK-LABEL: test_vst4_lane_s32 vst4_lane_s32(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* 
[[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_s64(int64_t *a, int64x1x4_t b) { - // CHECK-LABEL: test_vst4_lane_s64 vst4_lane_s64(a, b, 0); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr 
inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_f16(float16_t *a, float16x4x4_t b) { - // CHECK-LABEL: test_vst4_lane_f16 vst4_lane_f16(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x float>] %b.coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x
float> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_f32(float32_t *a, float32x2x4_t b) { - // CHECK-LABEL: test_vst4_lane_f32 vst4_lane_f32(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x double>] %b.coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double> +// CHECK: call void @llvm.aarch64.neon.st4lane.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_f64(float64_t *a, float64x1x4_t b) { - // CHECK-LABEL: test_vst4_lane_f64 vst4_lane_f64(a, b, 0); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0],
[{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8 +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: ret void void test_vst4_lane_p8(poly8_t *a, poly8x8x4_t b) { - // CHECK-LABEL: test_vst4_lane_p8 vst4_lane_p8(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK:
[[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_p16(poly16_t *a, poly16x4x4_t b) { - // CHECK-LABEL: test_vst4_lane_p16 vst4_lane_p16(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] %b.coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64
0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_p64(poly64_t *a, poly64x1x4_t b) { - // CHECK-LABEL: test_vst4_lane_p64 vst4_lane_p64(a, b, 0); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } Index: test/CodeGen/aarch64-neon-misc.c =================================================================== --- test/CodeGen/aarch64-neon-misc.c +++ test/CodeGen/aarch64-neon-misc.c @@ -1,2041 +1,3068 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -fallow-half-arguments-and-returns -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include <arm_neon.h> -// CHECK-LABEL: test_vceqz_s8 -// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// CHECK-LABEL: define <8 x i8> @test_vceqz_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCEQZ_I]] uint8x8_t test_vceqz_s8(int8x8_t a) { return vceqz_s8(a); } -// CHECK-LABEL: test_vceqz_s16 -// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vceqz_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCEQZ_I]] uint16x4_t test_vceqz_s16(int16x4_t a) { return vceqz_s16(a); } -// CHECK-LABEL: test_vceqz_s32 -// CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}} +// CHECK-LABEL: define <2 x i32> @test_vceqz_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCEQZ_I]] uint32x2_t test_vceqz_s32(int32x2_t a) { return vceqz_s32(a); } -// CHECK-LABEL: test_vceqz_s64 -// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCEQZ_I]] uint64x1_t test_vceqz_s64(int64x1_t a) { return vceqz_s64(a); } -// CHECK-LABEL: test_vceqz_u64 -// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +//
CHECK: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCEQZ_I]] uint64x1_t test_vceqz_u64(uint64x1_t a) { return vceqz_u64(a); } -// CHECK-LABEL: test_vceqz_p64 -// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCEQZ_I]] uint64x1_t test_vceqz_p64(poly64x1_t a) { return vceqz_p64(a); } -// CHECK-LABEL: test_vceqzq_s8 -// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vceqzq_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCEQZ_I]] uint8x16_t test_vceqzq_s8(int8x16_t a) { return vceqzq_s8(a); } -// CHECK-LABEL: test_vceqzq_s16 -// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vceqzq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCEQZ_I]] uint16x8_t test_vceqzq_s16(int16x8_t a) { return vceqzq_s16(a); } -// CHECK-LABEL: test_vceqzq_s32 -// CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}} +// CHECK-LABEL: define <4 x i32> @test_vceqzq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCEQZ_I]] uint32x4_t test_vceqzq_s32(int32x4_t a) { return vceqzq_s32(a); } -// CHECK-LABEL: test_vceqzq_s64 -// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}} +// CHECK-LABEL: define <2 x i64> @test_vceqzq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCEQZ_I]] uint64x2_t test_vceqzq_s64(int64x2_t a) { return vceqzq_s64(a); } -// CHECK-LABEL: test_vceqz_u8 -// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// CHECK-LABEL: define <8 x i8> @test_vceqz_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCEQZ_I]] uint8x8_t test_vceqz_u8(uint8x8_t a) { return vceqz_u8(a); } -// CHECK-LABEL: test_vceqz_u16 -// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vceqz_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCEQZ_I]] 
uint16x4_t test_vceqz_u16(uint16x4_t a) { return vceqz_u16(a); } -// CHECK-LABEL: test_vceqz_u32 -// CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}} +// CHECK-LABEL: define <2 x i32> @test_vceqz_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCEQZ_I]] uint32x2_t test_vceqz_u32(uint32x2_t a) { return vceqz_u32(a); } -// CHECK-LABEL: test_vceqzq_u8 -// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vceqzq_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCEQZ_I]] uint8x16_t test_vceqzq_u8(uint8x16_t a) { return vceqzq_u8(a); } -// CHECK-LABEL: test_vceqzq_u16 -// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vceqzq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCEQZ_I]] uint16x8_t test_vceqzq_u16(uint16x8_t a) { return vceqzq_u16(a); } -// CHECK-LABEL: test_vceqzq_u32 -// CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}} +// CHECK-LABEL: define <4 x i32> @test_vceqzq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCEQZ_I]] uint32x4_t test_vceqzq_u32(uint32x4_t a) { return vceqzq_u32(a); } -// CHECK-LABEL: test_vceqzq_u64 -// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}} +// CHECK-LABEL: define <2 x i64> @test_vceqzq_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCEQZ_I]] uint64x2_t test_vceqzq_u64(uint64x2_t a) { return vceqzq_u64(a); } -// CHECK-LABEL: test_vceqz_f32 -// CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +// CHECK-LABEL: define <2 x i32> @test_vceqz_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fcmp oeq <2 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCEQZ_I]] uint32x2_t test_vceqz_f32(float32x2_t a) { return vceqz_f32(a); } -// CHECK-LABEL: test_vceqz_f64 -// CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, #0 +// CHECK-LABEL: define <1 x i64> @test_vceqz_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fcmp oeq <1 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCEQZ_I]] uint64x1_t 
test_vceqz_f64(float64x1_t a) { return vceqz_f64(a); } -// CHECK-LABEL: test_vceqzq_f32 -// CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +// CHECK-LABEL: define <4 x i32> @test_vceqzq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fcmp oeq <4 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCEQZ_I]] uint32x4_t test_vceqzq_f32(float32x4_t a) { return vceqzq_f32(a); } -// CHECK-LABEL: test_vceqz_p8 -// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// CHECK-LABEL: define <8 x i8> @test_vceqz_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCEQZ_I]] uint8x8_t test_vceqz_p8(poly8x8_t a) { return vceqz_p8(a); } -// CHECK-LABEL: test_vceqzq_p8 -// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vceqzq_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCEQZ_I]] uint8x16_t test_vceqzq_p8(poly8x16_t a) { return vceqzq_p8(a); } -// CHECK-LABEL: test_vceqz_p16 -// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vceqz_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCEQZ_I]] uint16x4_t test_vceqz_p16(poly16x4_t a) { return vceqz_p16(a); } -// CHECK-LABEL: test_vceqzq_p16 -// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vceqzq_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCEQZ_I]] uint16x8_t test_vceqzq_p16(poly16x8_t a) { return vceqzq_p16(a); } -// CHECK-LABEL: test_vceqzq_f64 -// CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vceqzq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fcmp oeq <2 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCEQZ_I]] uint64x2_t test_vceqzq_f64(float64x2_t a) { return vceqzq_f64(a); } -// CHECK-LABEL: test_vceqzq_p64 -// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCEQZ_I]] uint64x2_t test_vceqzq_p64(poly64x2_t a) { return vceqzq_p64(a); } -// CHECK-LABEL: test_vcgez_s8 -// CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// 
CHECK-LABEL: define <8 x i8> @test_vcgez_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sge <8 x i8> %a, zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCGEZ_I]] uint8x8_t test_vcgez_s8(int8x8_t a) { return vcgez_s8(a); } -// CHECK-LABEL: test_vcgez_s16 -// CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vcgez_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp sge <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCGEZ_I]] uint16x4_t test_vcgez_s16(int16x4_t a) { return vcgez_s16(a); } -// CHECK-LABEL: test_vcgez_s32 -// CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}} +// CHECK-LABEL: define <2 x i32> @test_vcgez_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = icmp sge <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCGEZ_I]] uint32x2_t test_vcgez_s32(int32x2_t a) { return vcgez_s32(a); } -// CHECK-LABEL: test_vcgez_s64 -// CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp sge <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCGEZ_I]] uint64x1_t test_vcgez_s64(int64x1_t a) { return vcgez_s64(a); } -// CHECK-LABEL: test_vcgezq_s8 -// CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vcgezq_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sge <16 x i8> %a, zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCGEZ_I]] uint8x16_t test_vcgezq_s8(int8x16_t a) { return vcgezq_s8(a); } -// CHECK-LABEL: test_vcgezq_s16 -// CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vcgezq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp sge <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCGEZ_I]] uint16x8_t test_vcgezq_s16(int16x8_t a) { return vcgezq_s16(a); } -// CHECK-LABEL: test_vcgezq_s32 -// CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}} +// CHECK-LABEL: define <4 x i32> @test_vcgezq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp sge <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCGEZ_I]] uint32x4_t test_vcgezq_s32(int32x4_t a) { return vcgezq_s32(a); } -// CHECK-LABEL: test_vcgezq_s64 -// CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}} +// CHECK-LABEL: define <2 x i64> @test_vcgezq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x 
i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp sge <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCGEZ_I]] uint64x2_t test_vcgezq_s64(int64x2_t a) { return vcgezq_s64(a); } -// CHECK-LABEL: test_vcgez_f32 -// CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +// CHECK-LABEL: define <2 x i32> @test_vcgez_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fcmp oge <2 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCGEZ_I]] uint32x2_t test_vcgez_f32(float32x2_t a) { return vcgez_f32(a); } -// CHECK-LABEL: test_vcgez_f64 -// CHECK: fcmge {{d[0-9]+}}, {{d[0-9]+}}, #0 +// CHECK-LABEL: define <1 x i64> @test_vcgez_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fcmp oge <1 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCGEZ_I]] uint64x1_t test_vcgez_f64(float64x1_t a) { return vcgez_f64(a); } -// CHECK-LABEL: test_vcgezq_f32 -// CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +// CHECK-LABEL: define <4 x i32> @test_vcgezq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fcmp oge <4 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCGEZ_I]] uint32x4_t test_vcgezq_f32(float32x4_t a) { return vcgezq_f32(a); } -// CHECK-LABEL: test_vcgezq_f64 -// CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vcgezq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fcmp oge <2 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCGEZ_I]] uint64x2_t test_vcgezq_f64(float64x2_t a) { return vcgezq_f64(a); } -// CHECK-LABEL: test_vclez_s8 -// CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// CHECK-LABEL: define <8 x i8> @test_vclez_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sle <8 x i8> %a, zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCLEZ_I]] uint8x8_t test_vclez_s8(int8x8_t a) { return vclez_s8(a); } -// CHECK-LABEL: test_vclez_s16 -// CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vclez_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp sle <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCLEZ_I]] uint16x4_t test_vclez_s16(int16x4_t a) { return vclez_s16(a); } -// CHECK-LABEL: test_vclez_s32 -// CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}} +// CHECK-LABEL: define <2 x i32> @test_vclez_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x 
i32> +// CHECK: [[TMP2:%.*]] = icmp sle <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCLEZ_I]] uint32x2_t test_vclez_s32(int32x2_t a) { return vclez_s32(a); } -// CHECK-LABEL: test_vclez_s64 -// CHECK: cmle {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp sle <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCLEZ_I]] uint64x1_t test_vclez_s64(int64x1_t a) { return vclez_s64(a); } -// CHECK-LABEL: test_vclezq_s8 -// CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vclezq_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sle <16 x i8> %a, zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCLEZ_I]] uint8x16_t test_vclezq_s8(int8x16_t a) { return vclezq_s8(a); } -// CHECK-LABEL: test_vclezq_s16 -// CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vclezq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp sle <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCLEZ_I]] uint16x8_t test_vclezq_s16(int16x8_t a) { return vclezq_s16(a); } -// CHECK-LABEL: test_vclezq_s32 -// CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}} +// CHECK-LABEL: define <4 x i32> @test_vclezq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp sle <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCLEZ_I]] uint32x4_t test_vclezq_s32(int32x4_t a) { return vclezq_s32(a); } -// CHECK-LABEL: test_vclezq_s64 -// CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}} +// CHECK-LABEL: define <2 x i64> @test_vclezq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp sle <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCLEZ_I]] uint64x2_t test_vclezq_s64(int64x2_t a) { return vclezq_s64(a); } -// CHECK-LABEL: test_vclez_f32 -// CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +// CHECK-LABEL: define <2 x i32> @test_vclez_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fcmp ole <2 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCLEZ_I]] uint32x2_t test_vclez_f32(float32x2_t a) { return vclez_f32(a); } -// CHECK-LABEL: test_vclez_f64 -// CHECK: fcmle {{d[0-9]+}}, {{d[0-9]+}}, #0 +// CHECK-LABEL: define <1 x i64> @test_vclez_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: 
[[TMP2:%.*]] = fcmp ole <1 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCLEZ_I]] uint64x1_t test_vclez_f64(float64x1_t a) { return vclez_f64(a); } -// CHECK-LABEL: test_vclezq_f32 -// CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +// CHECK-LABEL: define <4 x i32> @test_vclezq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fcmp ole <4 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCLEZ_I]] uint32x4_t test_vclezq_f32(float32x4_t a) { return vclezq_f32(a); } -// CHECK-LABEL: test_vclezq_f64 -// CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vclezq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fcmp ole <2 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCLEZ_I]] uint64x2_t test_vclezq_f64(float64x2_t a) { return vclezq_f64(a); } -// CHECK-LABEL: test_vcgtz_s8 -// CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// CHECK-LABEL: define <8 x i8> @test_vcgtz_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sgt <8 x i8> %a, zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCGTZ_I]] uint8x8_t test_vcgtz_s8(int8x8_t a) { return vcgtz_s8(a); } -// CHECK-LABEL: test_vcgtz_s16 -// CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vcgtz_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp sgt <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCGTZ_I]] uint16x4_t test_vcgtz_s16(int16x4_t a) { return vcgtz_s16(a); } -// CHECK-LABEL: test_vcgtz_s32 -// CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}} +// CHECK-LABEL: define <2 x i32> @test_vcgtz_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = icmp sgt <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCGTZ_I]] uint32x2_t test_vcgtz_s32(int32x2_t a) { return vcgtz_s32(a); } -// CHECK-LABEL: test_vcgtz_s64 -// CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp sgt <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCGTZ_I]] uint64x1_t test_vcgtz_s64(int64x1_t a) { return vcgtz_s64(a); } -// CHECK-LABEL: test_vcgtzq_s8 -// CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vcgtzq_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sgt <16 x i8> %a, zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> 
[[VCGTZ_I]] uint8x16_t test_vcgtzq_s8(int8x16_t a) { return vcgtzq_s8(a); } -// CHECK-LABEL: test_vcgtzq_s16 -// CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vcgtzq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp sgt <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCGTZ_I]] uint16x8_t test_vcgtzq_s16(int16x8_t a) { return vcgtzq_s16(a); } -// CHECK-LABEL: test_vcgtzq_s32 -// CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}} +// CHECK-LABEL: define <4 x i32> @test_vcgtzq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCGTZ_I]] uint32x4_t test_vcgtzq_s32(int32x4_t a) { return vcgtzq_s32(a); } -// CHECK-LABEL: test_vcgtzq_s64 -// CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}} +// CHECK-LABEL: define <2 x i64> @test_vcgtzq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp sgt <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCGTZ_I]] uint64x2_t test_vcgtzq_s64(int64x2_t a) { return vcgtzq_s64(a); } -// CHECK-LABEL: test_vcgtz_f32 -// CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +// CHECK-LABEL: define <2 x i32> @test_vcgtz_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fcmp ogt <2 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCGTZ_I]] uint32x2_t test_vcgtz_f32(float32x2_t a) { return vcgtz_f32(a); } -// CHECK-LABEL: test_vcgtz_f64 -// CHECK: fcmgt {{d[0-9]+}}, {{d[0-9]+}}, #0 +// CHECK-LABEL: define <1 x i64> @test_vcgtz_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fcmp ogt <1 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCGTZ_I]] uint64x1_t test_vcgtz_f64(float64x1_t a) { return vcgtz_f64(a); } -// CHECK-LABEL: test_vcgtzq_f32 -// CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +// CHECK-LABEL: define <4 x i32> @test_vcgtzq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fcmp ogt <4 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCGTZ_I]] uint32x4_t test_vcgtzq_f32(float32x4_t a) { return vcgtzq_f32(a); } -// CHECK-LABEL: test_vcgtzq_f64 -// CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vcgtzq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fcmp 
ogt <2 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCGTZ_I]] uint64x2_t test_vcgtzq_f64(float64x2_t a) { return vcgtzq_f64(a); } -// CHECK-LABEL: test_vcltz_s8 -// CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #7 +// CHECK-LABEL: define <8 x i8> @test_vcltz_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp slt <8 x i8> %a, zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCLTZ_I]] uint8x8_t test_vcltz_s8(int8x8_t a) { return vcltz_s8(a); } -// CHECK-LABEL: test_vcltz_s16 -// CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 +// CHECK-LABEL: define <4 x i16> @test_vcltz_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp slt <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCLTZ_I]] uint16x4_t test_vcltz_s16(int16x4_t a) { return vcltz_s16(a); } -// CHECK-LABEL: test_vcltz_s32 -// CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 +// CHECK-LABEL: define <2 x i32> @test_vcltz_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCLTZ_I]] uint32x2_t test_vcltz_s32(int32x2_t a) { return vcltz_s32(a); } -// CHECK-LABEL: test_vcltz_s64 -// CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63 +// CHECK-LABEL: define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp slt <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCLTZ_I]] uint64x1_t test_vcltz_s64(int64x1_t a) { return vcltz_s64(a); } -// CHECK-LABEL: test_vcltzq_s8 -// CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #7 +// CHECK-LABEL: define <16 x i8> @test_vcltzq_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp slt <16 x i8> %a, zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCLTZ_I]] uint8x16_t test_vcltzq_s8(int8x16_t a) { return vcltzq_s8(a); } -// CHECK-LABEL: test_vcltzq_s16 -// CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 +// CHECK-LABEL: define <8 x i16> @test_vcltzq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp slt <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCLTZ_I]] uint16x8_t test_vcltzq_s16(int16x8_t a) { return vcltzq_s16(a); } -// CHECK-LABEL: test_vcltzq_s32 -// CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 +// CHECK-LABEL: define <4 x i32> @test_vcltzq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCLTZ_I]] uint32x4_t test_vcltzq_s32(int32x4_t a) { return vcltzq_s32(a); } -// 
CHECK-LABEL: test_vcltzq_s64 -// CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63 +// CHECK-LABEL: define <2 x i64> @test_vcltzq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCLTZ_I]] uint64x2_t test_vcltzq_s64(int64x2_t a) { return vcltzq_s64(a); } -// CHECK-LABEL: test_vcltz_f32 -// CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +// CHECK-LABEL: define <2 x i32> @test_vcltz_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fcmp olt <2 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCLTZ_I]] uint32x2_t test_vcltz_f32(float32x2_t a) { return vcltz_f32(a); } -// CHECK-LABEL: test_vcltz_f64 -// CHECK: fcmlt {{d[0-9]+}}, {{d[0-9]+}}, #0 +// CHECK-LABEL: define <1 x i64> @test_vcltz_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fcmp olt <1 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCLTZ_I]] uint64x1_t test_vcltz_f64(float64x1_t a) { return vcltz_f64(a); } -// CHECK-LABEL: test_vcltzq_f32 -// CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +// CHECK-LABEL: define <4 x i32> @test_vcltzq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fcmp olt <4 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCLTZ_I]] uint32x4_t test_vcltzq_f32(float32x4_t a) { return vcltzq_f32(a); } -// CHECK-LABEL: test_vcltzq_f64 -// CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vcltzq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fcmp olt <2 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCLTZ_I]] uint64x2_t test_vcltzq_f64(float64x2_t a) { return vcltzq_f64(a); } -// CHECK-LABEL: test_vrev16_s8 -// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev16_s8(int8x8_t a) { return vrev16_s8(a); } -// CHECK-LABEL: test_vrev16_u8 -// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev16_u8(uint8x8_t a) { return vrev16_u8(a); } -// CHECK-LABEL: test_vrev16_p8 -// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev16_p8(poly8x8_t a) { return vrev16_p8(a); } -// CHECK-LABEL: test_vrev16q_s8 -// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev16q_s8(int8x16_t a) { return vrev16q_s8(a); } -// CHECK-LABEL: test_vrev16q_u8 -// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev16q_u8(uint8x16_t a) { return vrev16q_u8(a); } -// CHECK-LABEL: test_vrev16q_p8 -// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev16q_p8(poly8x16_t a) { return vrev16q_p8(a); } -// CHECK-LABEL: test_vrev32_s8 -// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev32_s8(int8x8_t a) { return vrev32_s8(a); } -// CHECK-LABEL: test_vrev32_s16 -// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vrev32_s16(int16x4_t a) { return vrev32_s16(a); } -// CHECK-LABEL: test_vrev32_u8 -// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev32_u8(uint8x8_t a) { return vrev32_u8(a); } -// CHECK-LABEL: test_vrev32_u16 -// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vrev32_u16(uint16x4_t a) { return vrev32_u16(a); } -// CHECK-LABEL: test_vrev32_p8 -// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vrev32_p8(poly8x8_t a) { return vrev32_p8(a); } -// CHECK-LABEL: test_vrev32_p16 -// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vrev32_p16(poly16x4_t a) { return vrev32_p16(a); } -// CHECK-LABEL: test_vrev32q_s8 -// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev32q_s8(int8x16_t a) { return vrev32q_s8(a); } -// CHECK-LABEL: test_vrev32q_s16 -// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16>
@test_vrev32q_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vrev32q_s16(int16x8_t a) { return vrev32q_s16(a); } -// CHECK-LABEL: test_vrev32q_u8 -// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev32q_u8(uint8x16_t a) { return vrev32q_u8(a); } -// CHECK-LABEL: test_vrev32q_u16 -// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vrev32q_u16(uint16x8_t a) { return vrev32q_u16(a); } -// CHECK-LABEL: test_vrev32q_p8 -// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev32q_p8(poly8x16_t a) { return vrev32q_p8(a); } -// CHECK-LABEL: test_vrev32q_p16 -// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vrev32q_p16(poly16x8_t a) { return vrev32q_p16(a); } -// CHECK-LABEL: test_vrev64_s8 -// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev64_s8(int8x8_t a) { return vrev64_s8(a); } -// CHECK-LABEL: test_vrev64_s16 -// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vrev64_s16(int16x4_t a) { return vrev64_s16(a); } -// CHECK-LABEL: test_vrev64_s32 -// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s +// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vrev64_s32(int32x2_t a) { return vrev64_s32(a); } -// CHECK-LABEL: test_vrev64_u8 -// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev64_u8(uint8x8_t a) { return vrev64_u8(a); } -// CHECK-LABEL: test_vrev64_u16 -// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vrev64_u16(uint16x4_t a) { return vrev64_u16(a); } -// CHECK-LABEL: test_vrev64_u32 -// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s +// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vrev64_u32(uint32x2_t a) { return vrev64_u32(a); } -// CHECK-LABEL: test_vrev64_p8 -// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vrev64_p8(poly8x8_t a) { return vrev64_p8(a); } -// CHECK-LABEL: test_vrev64_p16 -// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vrev64_p16(poly16x4_t a) { return vrev64_p16(a); } -// CHECK-LABEL: test_vrev64_f32 -// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s +// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vrev64_f32(float32x2_t a) { return vrev64_f32(a); } -// CHECK-LABEL: test_vrev64q_s8 -// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev64q_s8(int8x16_t a) { return vrev64q_s8(a); } -// CHECK-LABEL: test_vrev64q_s16 -// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vrev64q_s16(int16x8_t a) { return vrev64q_s16(a); } -// CHECK-LABEL: test_vrev64q_s32 -// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s +// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vrev64q_s32(int32x4_t a) { return vrev64q_s32(a); } -// CHECK-LABEL: test_vrev64q_u8 -// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev64q_u8(uint8x16_t a) { return vrev64q_u8(a); } -// CHECK-LABEL: test_vrev64q_u16 -// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vrev64q_u16(uint16x8_t a) { return vrev64q_u16(a); } -// CHECK-LABEL: test_vrev64q_u32 -// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s +// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vrev64q_u32(uint32x4_t a) { return vrev64q_u32(a); } -// CHECK-LABEL: test_vrev64q_p8 -// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev64q_p8(poly8x16_t a) { return vrev64q_p8(a); } -// CHECK-LABEL: test_vrev64q_p16 -// CHECK: rev64 v{{[0-9]+}}.8h,
v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vrev64q_p16(poly16x8_t a) { return vrev64q_p16(a); } -// CHECK-LABEL: test_vrev64q_f32 -// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s +// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vrev64q_f32(float32x4_t a) { return vrev64q_f32(a); } +// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a) #2 +// CHECK: ret <4 x i16> [[VPADDL_I]] int16x4_t test_vpaddl_s8(int8x8_t a) { - // CHECK-LABEL: test_vpaddl_s8 return vpaddl_s8(a); - // CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #2 +// CHECK: ret <2 x i32> [[VPADDL1_I]] int32x2_t test_vpaddl_s16(int16x4_t a) { - // CHECK-LABEL: test_vpaddl_s16 return vpaddl_s16(a); - // CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h } +// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #2 +// CHECK: ret <1 x i64> [[VPADDL1_I]] int64x1_t test_vpaddl_s32(int32x2_t a) { - // CHECK-LABEL: test_vpaddl_s32 return vpaddl_s32(a); - // CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a) #2 +// CHECK: ret <4 x i16> [[VPADDL_I]] uint16x4_t test_vpaddl_u8(uint8x8_t a) { - // CHECK-LABEL: test_vpaddl_u8 return vpaddl_u8(a); - // CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #2 +// CHECK: ret <2 x i32> [[VPADDL1_I]] uint32x2_t test_vpaddl_u16(uint16x4_t a) { - // CHECK-LABEL: test_vpaddl_u16 return vpaddl_u16(a); - // CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h } +// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #2 +// CHECK: ret <1 x i64> [[VPADDL1_I]] uint64x1_t test_vpaddl_u32(uint32x2_t a) { - // CHECK-LABEL: test_vpaddl_u32 return vpaddl_u32(a); - // CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #2 +// CHECK: ret <8 x i16> [[VPADDL_I]] int16x8_t
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #2
+// CHECK: ret <8 x i16> [[VPADDL_I]]
 int16x8_t test_vpaddlq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vpaddlq_s8
   return vpaddlq_s8(a);
-  // CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #2
+// CHECK: ret <4 x i32> [[VPADDL1_I]]
 int32x4_t test_vpaddlq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vpaddlq_s16
   return vpaddlq_s16(a);
-  // CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #2
+// CHECK: ret <2 x i64> [[VPADDL1_I]]
 int64x2_t test_vpaddlq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vpaddlq_s32
   return vpaddlq_s32(a);
-  // CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
+// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a) #2
+// CHECK: ret <8 x i16> [[VPADDL_I]]
 uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vpaddlq_u8
   return vpaddlq_u8(a);
-  // CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #2
+// CHECK: ret <4 x i32> [[VPADDL1_I]]
 uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vpaddlq_u16
   return vpaddlq_u16(a);
-  // CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #2
+// CHECK: ret <2 x i64> [[VPADDL1_I]]
 uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vpaddlq_u32
   return vpaddlq_u32(a);
-  // CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b) #2
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vpadal_s8
   return vpadal_s8(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
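vpadal ("pairwise add and accumulate long") used to be checked as a single sadalp/uadalp instruction; the CHECK lines above show it is now emitted as the widening pairwise add of b followed by an ordinary vector add into a. A reference sketch under that reading (helper name illustrative; the final add wraps exactly like the plain IR add):

#include <stdint.h>

/* vpadal_s8 reference: r = a + saddlp(b), with the accumulate wrapping
   modulo 2^16, matching the `add` in the IR. */
static void padal_s8x8(const int16_t a[4], const int8_t b[8], int16_t r[4]) {
  for (int i = 0; i < 4; ++i)
    r[i] = (int16_t)(a[i] + (int16_t)b[2 * i] + (int16_t)b[2 * i + 1]);
}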
+// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <2 x i32> [[TMP3]]
 int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vpadal_s16
   return vpadal_s16(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <1 x i64> [[TMP3]]
 int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vpadal_s32
   return vpadal_s32(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b) #2
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK: ret <4 x i16> [[TMP2]]
 uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vpadal_u8
   return vpadal_u8(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <2 x i32> [[TMP3]]
 uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vpadal_u16
   return vpadal_u16(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <1 x i64> [[TMP3]]
 uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vpadal_u32
   return vpadal_u32(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b) #2
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK: ret <8 x i16> [[TMP2]]
 int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vpadalq_s8
   return vpadalq_s8(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <4 x i32> [[TMP3]]
 int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vpadalq_s16
   return vpadalq_s16(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <2 x i64> [[TMP3]]
 int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vpadalq_s32
   return vpadalq_s32(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b) #2
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK: ret <8 x i16> [[TMP2]]
 uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vpadalq_u8
   return vpadalq_u8(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <4 x i32> [[TMP3]]
 uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vpadalq_u16
   return vpadalq_u16(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <2 x i64> [[TMP3]]
 uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vpadalq_u32
   return vpadalq_u32(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
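sqabs, tested next, is a saturating absolute value: the only input where it differs from a plain abs is the most negative value, which has no positive counterpart in the same type. A scalar sketch with an illustrative helper name:

#include <stdint.h>

/* vqabs_s8 reference: |x| with saturation, so INT8_MIN (-128) yields
   INT8_MAX (127) instead of wrapping back to -128. */
static int8_t qabs_s8(int8_t x) {
  if (x == INT8_MIN) return INT8_MAX;
  return (int8_t)(x < 0 ? -x : x);
}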
+// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
+// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VQABS_V_I]]
 int8x8_t test_vqabs_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqabs_s8
   return vqabs_s8(a);
-  // CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VQABSQ_V_I]]
 int8x16_t test_vqabsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqabsq_s8
   return vqabsq_s8(a);
-  // CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #2
+// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqabs_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqabs_s16
   return vqabs_s16(a);
-  // CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #2
+// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vqabsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqabsq_s16
   return vqabsq_s16(a);
-  // CHECK: sqabs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #2
+// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqabs_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqabs_s32
   return vqabs_s32(a);
-  // CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #2
+// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vqabsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqabsq_s32
   return vqabsq_s32(a);
-  // CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> [[VQABSQ_V_I]]) #2
+// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <2 x i64> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP1]]
 int64x2_t test_vqabsq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqabsq_s64
   return vqabsq_s64(a);
-  // CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VQNEG_V_I]]
 int8x8_t test_vqneg_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqneg_s8
   return vqneg_s8(a);
-  // CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
 int8x16_t test_vqnegq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqnegq_s8
   return vqnegq_s8(a);
-  // CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #2
+// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqneg_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqneg_s16
   return vqneg_s16(a);
-  // CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #2
+// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vqnegq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqnegq_s16
   return vqnegq_s16(a);
-  // CHECK: sqneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #2
+// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqneg_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqneg_s32
   return vqneg_s32(a);
-  // CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #2
+// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vqnegq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqnegq_s32
   return vqnegq_s32(a);
-  // CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
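sqneg saturates in the same single corner case: negating the most negative value. A sketch with an illustrative helper name:

#include <stdint.h>

/* vqneg_s8 reference: -x with saturation; -INT8_MIN is not
   representable, so it clamps to INT8_MAX. */
static int8_t qneg_s8(int8_t x) {
  return x == INT8_MIN ? INT8_MAX : (int8_t)-x;
}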
+// CHECK-LABEL: define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> [[VQNEGQ_V_I]]) #2
+// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <2 x i64> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP1]]
 int64x2_t test_vqnegq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqnegq_s64
   return vqnegq_s64(a);
-  // CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
+// CHECK: ret <8 x i8> [[SUB_I]]
 int8x8_t test_vneg_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vneg_s8
   return vneg_s8(a);
-  // CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
+// CHECK: ret <16 x i8> [[SUB_I]]
 int8x16_t test_vnegq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vnegq_s8
   return vnegq_s8(a);
-  // CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
+// CHECK: ret <4 x i16> [[SUB_I]]
 int16x4_t test_vneg_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vneg_s16
   return vneg_s16(a);
-  // CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
+// CHECK: ret <8 x i16> [[SUB_I]]
 int16x8_t test_vnegq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vnegq_s16
   return vnegq_s16(a);
-  // CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
+// CHECK: ret <2 x i32> [[SUB_I]]
 int32x2_t test_vneg_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vneg_s32
   return vneg_s32(a);
-  // CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
+// CHECK: ret <4 x i32> [[SUB_I]]
 int32x4_t test_vnegq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vnegq_s32
   return vnegq_s32(a);
-  // CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, %a
+// CHECK: ret <2 x i64> [[SUB_I]]
 int64x2_t test_vnegq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vnegq_s64
   return vnegq_s64(a);
-  // CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = fsub <2 x float> , %a
+// CHECK: ret <2 x float> [[SUB_I]]
 float32x2_t test_vneg_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vneg_f32
   return vneg_f32(a);
-  // CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = fsub <4 x float> , %a
+// CHECK: ret <4 x float> [[SUB_I]]
 float32x4_t test_vnegq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vnegq_f32
   return vnegq_f32(a);
-  // CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = fsub <2 x double> , %a
+// CHECK: ret <2 x double> [[SUB_I]]
 float64x2_t test_vnegq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vnegq_f64
   return vnegq_f64(a);
-  // CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
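Plain vneg needs no target intrinsic at all: the integer variants lower to `sub zeroinitializer, %a`, and the float variants to an fsub whose (elided) first operand is presumably the usual -0.0 splat LLVM uses to express fneg. Subtracting from -0.0 rather than +0.0 matters only for signed zero, as this small sketch illustrates:

/* Why fneg is modeled as (-0.0) - x rather than 0.0 - x: negation must
   flip the sign of zero, and 0.0f - 0.0f would give +0.0f, not -0.0f. */
static float neg_f32(float x) { return -0.0f - x; }  /* behaves as -x */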
+// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
+// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VABS_I]]
 int8x8_t test_vabs_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vabs_s8
   return vabs_s8(a);
-  // CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VABS_I]]
 int8x16_t test_vabsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vabsq_s8
   return vabsq_s8(a);
-  // CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[VABS_I]]) #2
+// CHECK: ret <4 x i16> [[VABS1_I]]
 int16x4_t test_vabs_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vabs_s16
   return vabs_s16(a);
-  // CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[VABS_I]]) #2
+// CHECK: ret <8 x i16> [[VABS1_I]]
 int16x8_t test_vabsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vabsq_s16
   return vabsq_s16(a);
-  // CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[VABS_I]]) #2
+// CHECK: ret <2 x i32> [[VABS1_I]]
 int32x2_t test_vabs_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vabs_s32
   return vabs_s32(a);
-  // CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[VABS_I]]) #2
+// CHECK: ret <4 x i32> [[VABS1_I]]
 int32x4_t test_vabsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vabsq_s32
   return vabsq_s32(a);
-  // CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> [[VABS_I]]) #2
+// CHECK: ret <2 x i64> [[VABS1_I]]
 int64x2_t test_vabsq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vabsq_s64
   return vabsq_s64(a);
-  // CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #2
+// CHECK: ret <2 x float> [[VABS1_I]]
 float32x2_t test_vabs_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vabs_f32
   return vabs_f32(a);
-  // CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
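vabs, by contrast with vqabs above, does not saturate: the integer forms map to the target abs intrinsic (wrapping at the most negative value) and the float forms to the generic llvm.fabs. A sketch with an illustrative helper name; the wrap on INT8_MIN is the conventional two's-complement behavior:

#include <stdint.h>

/* vabs_s8 reference: plain two's-complement |x|, so abs(INT8_MIN)
   stays INT8_MIN, unlike the saturating sqabs. */
static int8_t abs_s8(int8_t x) {
  return (int8_t)(x < 0 ? -x : x);  /* (int8_t)128 wraps to -128 */
}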
+// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #2
+// CHECK: ret <4 x float> [[VABS1_I]]
 float32x4_t test_vabsq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vabsq_f32
   return vabsq_f32(a);
-  // CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x double> @test_vabsq_f64(<2 x double> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[VABS_I]]) #2
+// CHECK: ret <2 x double> [[VABS1_I]]
 float64x2_t test_vabsq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vabsq_f64
   return vabsq_f64(a);
-  // CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #2
+// CHECK: ret <8 x i8> [[VUQADD_I]]
 int8x8_t test_vuqadd_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuqadd_s8
   return vuqadd_s8(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK: ret <16 x i8> [[VUQADD_I]]
 int8x16_t test_vuqaddq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuqaddq_s8
   return vuqaddq_s8(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[VUQADD_I]], <4 x i16> [[VUQADD1_I]]) #2
+// CHECK: ret <4 x i16> [[VUQADD2_I]]
 int16x4_t test_vuqadd_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuqadd_s16
   return vuqadd_s16(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[VUQADD_I]], <8 x i16> [[VUQADD1_I]]) #2
+// CHECK: ret <8 x i16> [[VUQADD2_I]]
 int16x8_t test_vuqaddq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuqaddq_s16
   return vuqaddq_s16(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
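suqadd ("signed saturating accumulate of unsigned value") treats the second operand as unsigned, adds it to the signed accumulator, and saturates to the signed range. A reference sketch; the helper name and the uint8_t operand type are assumptions for illustration (the test's C prototypes pass both operands as signed vectors):

#include <stdint.h>

/* vuqadd_s8 reference: a + (unsigned)b, clamped to the int8_t range.
   Since b is non-negative, the sum can only overflow upward. */
static int8_t uqadd_s8(int8_t a, uint8_t b) {
  int sum = (int)a + (int)b;
  return (int8_t)(sum > INT8_MAX ? INT8_MAX : sum);
}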
+// CHECK-LABEL: define <2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[VUQADD_I]], <2 x i32> [[VUQADD1_I]]) #2
+// CHECK: ret <2 x i32> [[VUQADD2_I]]
 int32x2_t test_vuqadd_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuqadd_s32
   return vuqadd_s32(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[VUQADD_I]], <4 x i32> [[VUQADD1_I]]) #2
+// CHECK: ret <4 x i32> [[VUQADD2_I]]
 int32x4_t test_vuqaddq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuqaddq_s32
   return vuqaddq_s32(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[VUQADD_I]], <2 x i64> [[VUQADD1_I]]) #2
+// CHECK: ret <2 x i64> [[VUQADD2_I]]
 int64x2_t test_vuqaddq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuqaddq_s64
   return vuqaddq_s64(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
+// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VCLS_V_I]]
 int8x8_t test_vcls_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vcls_s8
   return vcls_s8(a);
-  // CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
 int8x16_t test_vclsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vclsq_s8
   return vclsq_s8(a);
-  // CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[VCLS_V_I]]) #2
+// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vcls_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vcls_s16
   return vcls_s16(a);
-  // CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #2
+// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vclsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vclsq_s16
   return vclsq_s16(a);
-  // CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
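cls counts leading sign bits: how many bits directly below the sign bit agree with it. A sketch with an illustrative helper name:

#include <stdint.h>

/* vcls_s8 reference: count of consecutive bits below bit 7 equal to
   bit 7; range is 0..7, and cls(0) == cls(-1) == 7. */
static int cls_s8(int8_t x) {
  int sign = (x >> 7) & 1, n = 0;
  for (int bit = 6; bit >= 0 && ((x >> bit) & 1) == sign; --bit)
    ++n;
  return n;
}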
+// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[VCLS_V_I]]) #2
+// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vcls_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vcls_s32
   return vcls_s32(a);
-  // CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #2
+// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vclsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vclsq_s32
   return vclsq_s32(a);
-  // CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
+// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2
+// CHECK: ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vclz_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vclz_s8
   return vclz_s8(a);
-  // CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2
+// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
 int8x16_t test_vclzq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vclzq_s8
   return vclzq_s8(a);
-  // CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #2
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vclz_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vclz_s16
   return vclz_s16(a);
-  // CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #2
+// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vclzq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vclzq_s16
   return vclzq_s16(a);
-  // CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
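vclz maps onto the generic llvm.ctlz intrinsic; the `i1 false` second argument in the calls above marks a zero input as well-defined, yielding the lane width, which matches the AArch64 clz instruction. A sketch with an illustrative helper name:

#include <stdint.h>

/* vclz_u8 reference: leading-zero count per lane; clz(0) is defined
   as 8, the lane width, per the `i1 false` flag. */
static int clz_u8(uint8_t x) {
  int n = 0;
  for (int bit = 7; bit >= 0 && !((x >> bit) & 1); --bit)
    ++n;
  return n;
}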
+// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #2
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vclz_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vclz_s32
   return vclz_s32(a);
-  // CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #2
+// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vclzq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vclzq_s32
   return vclzq_s32(a);
-  // CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 {
+// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2
+// CHECK: ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vclz_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vclz_u8
   return vclz_u8(a);
-  // CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 {
+// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2
+// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
 uint8x16_t test_vclzq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vclzq_u8
   return vclzq_u8(a);
-  // CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #2
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 uint16x4_t test_vclz_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vclz_u16
   return vclz_u16(a);
-  // CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #2
+// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 uint16x8_t test_vclzq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vclzq_u16
   return vclzq_u16(a);
-  // CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #2
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 uint32x2_t test_vclz_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vclz_u32
   return vclz_u32(a);
-  // CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #2
+// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 uint32x4_t test_vclzq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vclzq_u32
   return vclzq_u32(a);
-  // CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
+// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VCNT_V_I]]
 int8x8_t test_vcnt_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vcnt_s8
   return vcnt_s8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
 int8x16_t test_vcntq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vcntq_s8
   return vcntq_s8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 {
+// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VCNT_V_I]]
 uint8x8_t test_vcnt_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vcnt_u8
   return vcnt_u8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 {
+// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
 uint8x16_t test_vcntq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vcntq_u8
   return vcntq_u8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 {
+// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcnt_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vcnt_p8
   return vcnt_p8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 {
+// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
 poly8x16_t test_vcntq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vcntq_p8
   return vcntq_p8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a,
+// CHECK: ret <8 x i8> [[NEG_I]]
 int8x8_t test_vmvn_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vmvn_s8
   return vmvn_s8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a,
+// CHECK: ret <16 x i8> [[NEG_I]]
 int8x16_t test_vmvnq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_s8
   return vmvnq_s8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a,
+// CHECK: ret <4 x i16> [[NEG_I]]
 int16x4_t test_vmvn_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vmvn_s16
   return vmvn_s16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a,
+// CHECK: ret <8 x i16> [[NEG_I]]
 int16x8_t test_vmvnq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmvnq_s16
   return vmvnq_s16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
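vcnt is a per-byte population count (llvm.ctpop), and vmvn is a bitwise NOT, emitted as xor against an all-ones vector (the splat constant is elided above). Since NOT is element-size agnostic, the old assembly checks use the .8b/.16b byte arrangement for every vmvn variant regardless of lane width. Sketches with illustrative helper names:

#include <stdint.h>

/* vcnt_u8 reference: number of set bits in each byte lane. */
static uint8_t cnt_u8(uint8_t x) {
  uint8_t n = 0;
  for (; x; x = (uint8_t)(x & (x - 1)))  /* clear the lowest set bit */
    ++n;
  return n;
}

/* vmvn_u8 reference: bitwise complement, i.e. x ^ 0xFF per byte. */
static uint8_t mvn_u8(uint8_t x) { return (uint8_t)~x; }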
+// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a,
+// CHECK: ret <2 x i32> [[NEG_I]]
 int32x2_t test_vmvn_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vmvn_s32
   return vmvn_s32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a,
+// CHECK: ret <4 x i32> [[NEG_I]]
 int32x4_t test_vmvnq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmvnq_s32
   return vmvnq_s32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a,
+// CHECK: ret <8 x i8> [[NEG_I]]
 uint8x8_t test_vmvn_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vmvn_u8
   return vmvn_u8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a,
+// CHECK: ret <16 x i8> [[NEG_I]]
 uint8x16_t test_vmvnq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_u8
   return vmvnq_u8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a,
+// CHECK: ret <4 x i16> [[NEG_I]]
 uint16x4_t test_vmvn_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vmvn_u16
   return vmvn_u16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a,
+// CHECK: ret <8 x i16> [[NEG_I]]
 uint16x8_t test_vmvnq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmvnq_u16
   return vmvnq_u16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a,
+// CHECK: ret <2 x i32> [[NEG_I]]
 uint32x2_t test_vmvn_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vmvn_u32
   return vmvn_u32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a,
+// CHECK: ret <4 x i32> [[NEG_I]]
 uint32x4_t test_vmvnq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmvnq_u32
   return vmvnq_u32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a,
+// CHECK: ret <8 x i8> [[NEG_I]]
 poly8x8_t test_vmvn_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vmvn_p8
   return vmvn_p8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a,
+// CHECK: ret <16 x i8> [[NEG_I]]
 poly8x16_t test_vmvnq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_p8
   return vmvnq_p8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VRBIT_I]]
 int8x8_t test_vrbit_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vrbit_s8
   return vrbit_s8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
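rbit reverses the bit order within each byte lane. A sketch with an illustrative helper name:

#include <stdint.h>

/* vrbit_u8 reference: bit i of the result is bit (7 - i) of the input. */
static uint8_t rbit_u8(uint8_t x) {
  uint8_t r = 0;
  for (int i = 0; i < 8; ++i)
    r = (uint8_t)((r << 1) | ((x >> i) & 1));
  return r;
}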
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VRBIT_I]]
 int8x16_t test_vrbitq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_s8
   return vrbitq_s8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vrbit_u8(<8 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VRBIT_I]]
 uint8x8_t test_vrbit_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vrbit_u8
   return vrbit_u8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_u8(<16 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VRBIT_I]]
 uint8x16_t test_vrbitq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_u8
   return vrbitq_u8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vrbit_p8(<8 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VRBIT_I]]
 poly8x8_t test_vrbit_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vrbit_p8
   return vrbit_p8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_p8(<16 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VRBIT_I]]
 poly8x16_t test_vrbitq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_p8
   return vrbitq_p8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VMOVN_I]]
 int8x8_t test_vmovn_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmovn_s16
   return vmovn_s16(a);
-  // CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VMOVN_I]]
 int16x4_t test_vmovn_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmovn_s32
   return vmovn_s32(a);
-  // CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VMOVN_I]]
 int32x2_t test_vmovn_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vmovn_s64
   return vmovn_s64(a);
-  // CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VMOVN_I]]
 uint8x8_t test_vmovn_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmovn_u16
   return vmovn_u16(a);
-  // CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VMOVN_I]]
 uint16x4_t test_vmovn_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmovn_u32
   return vmovn_u32(a);
-  // CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VMOVN_I]]
 uint32x2_t test_vmovn_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vmovn_u64
   return vmovn_u64(a);
-  // CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32>
+// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vmovn_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmovn_high_s16
   return vmovn_high_s16(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32>
+// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vmovn_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmovn_high_s32
   return vmovn_high_s32(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32>
+// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vmovn_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vmovn_high_s64
   return vmovn_high_s64(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <16 x i8> @test_vmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32>
+// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vmovn_high_u16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmovn_high_u16
   return vmovn_high_u16(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32>
+// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vmovn_high_u32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmovn_high_u32
   return vmovn_high_u32(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
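vmovn is a plain lane-wise truncation (xtn), and the _high variants narrow b and append it to the existing low half a, which the IR expresses as a trunc plus one shufflevector concatenating the two halves (mask elided above). A sketch with an illustrative helper name:

#include <stdint.h>

/* vmovn_high_s16 reference: keep a as lanes 0..7, then truncate each
   i16 lane of b to i8 for lanes 8..15. */
static void movn_high_s16(const int8_t a[8], const int16_t b[8], int8_t r[16]) {
  for (int i = 0; i < 8; ++i) r[i] = a[i];
  for (int i = 0; i < 8; ++i) r[8 + i] = (int8_t)b[i];  /* truncating */
}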
+// CHECK-LABEL: define <4 x i32> @test_vmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32>
+// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vmovn_high_u64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vmovn_high_u64
   return vmovn_high_u64(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #2
+// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
 int8x8_t test_vqmovun_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqmovun_s16
   return vqmovun_s16(a);
-  // CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #2
+// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovun_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqmovun_s32
   return vqmovun_s32(a);
-  // CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #2
+// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovun_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqmovun_s64
   return vqmovun_s64(a);
-  // CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I_I]]) #2
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32>
+// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vqmovun_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s16
   return vqmovun_high_s16(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I_I]]) #2
+// CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <4 x i16>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32>
+// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vqmovun_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s32
   return vqmovun_high_s32(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I_I]]) #2
+// CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <2 x i32>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32>
+// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vqmovun_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s64
   return vqmovun_high_s64(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #2
+// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
 int8x8_t test_vqmovn_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqmovn_s16
   return vqmovn_s16(a);
-  // CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #2
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovn_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqmovn_s32
   return vqmovn_s32(a);
-  // CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #2
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovn_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqmovn_s64
   return vqmovn_s64(a);
-  // CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) #2
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32>
+// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vqmovn_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s16
   return vqmovn_high_s16(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
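Unlike the truncating vmovn, these narrows saturate: sqxtn clamps a signed wide lane to the signed narrow range, sqxtun clamps a signed wide lane to the unsigned narrow range, and uqxtn (below) is the unsigned-to-unsigned analogue. Sketches with illustrative helper names:

#include <stdint.h>

/* vqmovn_s16 reference (sqxtn): signed saturate to int8_t. */
static int8_t qmovn_s16(int16_t x) {
  if (x > INT8_MAX) return INT8_MAX;
  if (x < INT8_MIN) return INT8_MIN;
  return (int8_t)x;
}

/* vqmovun_s16 reference (sqxtun): signed input saturated to uint8_t. */
static uint8_t qmovun_s16(int16_t x) {
  if (x < 0) return 0;
  if (x > UINT8_MAX) return UINT8_MAX;
  return (uint8_t)x;
}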
@test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vqmovn_high_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqmovn_high_s32 return vqmovn_high_s32(a, b); - // CHECK: sqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vqmovn_high_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vqmovn_high_s64 return vqmovn_high_s64(a, b); - // CHECK: sqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #2 +// CHECK: ret <8 x i8> [[VQMOVN_V1_I]] uint8x8_t test_vqmovn_u16(uint16x8_t a) { - // CHECK-LABEL: test_vqmovn_u16 return vqmovn_u16(a); - // CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #2 +// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP1]] uint16x4_t test_vqmovn_u32(uint32x4_t a) { - // CHECK-LABEL: test_vqmovn_u32 return vqmovn_u32(a); - // CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #2 +// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP1]] uint32x2_t test_vqmovn_u64(uint64x2_t a) { - // CHECK-LABEL: test_vqmovn_u64 return vqmovn_u64(a); - // CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 
x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vqmovn_high_u16(uint8x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vqmovn_high_u16 return vqmovn_high_u16(a, b); - // CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vqmovn_high_u32(uint16x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vqmovn_high_u32 return vqmovn_high_u32(a, b); - // CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vqmovn_high_u64(uint32x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vqmovn_high_u64 return vqmovn_high_u64(a, b); - // CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] int16x8_t test_vshll_n_s8(int8x8_t a) { - // CHECK-LABEL: test_vshll_n_s8 return vshll_n_s8(a, 8); - // CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8 } +// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] int32x4_t test_vshll_n_s16(int16x4_t a) { - // CHECK-LABEL: test_vshll_n_s16 return vshll_n_s16(a, 16); - // CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16 } +// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] int64x2_t test_vshll_n_s32(int32x2_t a) { - // CHECK-LABEL: test_vshll_n_s32 return vshll_n_s32(a, 32); - // CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32 } +// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = 
zext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] uint16x8_t test_vshll_n_u8(uint8x8_t a) { - // CHECK-LABEL: test_vshll_n_u8 return vshll_n_u8(a, 8); - // CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8 } +// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] uint32x4_t test_vshll_n_u16(uint16x4_t a) { - // CHECK-LABEL: test_vshll_n_u16 return vshll_n_u16(a, 16); - // CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16 } +// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] uint64x2_t test_vshll_n_u32(uint32x2_t a) { - // CHECK-LABEL: test_vshll_n_u32 return vshll_n_u32(a, 32); - // CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32 }
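
Every vshll_n test above shifts by exactly the source lane width (8, 16 or 32), the one case the SHLL form covers, so the IR is nothing more than a sign or zero extend followed by a shl. The vshll_high_n_* tests that follow add only an initial shufflevector that extracts the high half before the same extend-and-shift. A reference formulation, illustrative only (the _ref helper name is made up and is not part of the patch):

  #include <arm_neon.h>

  // Sketch: vshll_n_s8(a, 8) behaves like widening first and then
  // shifting the widened lanes: sext <8 x i8> to <8 x i16>, then shl by 8.
  static inline int16x8_t vshll_n_s8_ref(int8x8_t a) {
    return vshlq_n_s16(vmovl_s8(a), 8);
  }
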
+// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] int16x8_t test_vshll_high_n_s8(int8x16_t a) { - // CHECK-LABEL: test_vshll_high_n_s8 return vshll_high_n_s8(a, 8); - // CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8 } +// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] int32x4_t test_vshll_high_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vshll_high_n_s16 return vshll_high_n_s16(a, 16); - // CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16 } +// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] int64x2_t test_vshll_high_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vshll_high_n_s32 return vshll_high_n_s32(a, 32); - // CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32 } +// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] uint16x8_t test_vshll_high_n_u8(uint8x16_t a) { - // CHECK-LABEL: test_vshll_high_n_u8 return vshll_high_n_u8(a, 8); - // CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8 } +// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] uint32x4_t test_vshll_high_n_u16(uint16x8_t a) { - // CHECK-LABEL: test_vshll_high_n_u16 return vshll_high_n_u16(a, 16); - // CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16 } +// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] uint64x2_t test_vshll_high_n_u32(uint32x4_t a) { - // CHECK-LABEL: test_vshll_high_n_u32 return vshll_high_n_u32(a, 32); - // CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32 } +// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #2 +// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half> +// CHECK: ret <4 x half> [[TMP1]] float16x4_t test_vcvt_f16_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvt_f16_f32 return vcvt_f16_f32(a); - // CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <8 x half> @test_vcvt_high_f16_f32(<4 x half> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VCVT_F16_F32_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I_I]]) #2 +// CHECK: [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x half> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x half> %a, <4 x half> [[TMP1]], <8 x i32> +// CHECK: ret <8 x half> [[SHUFFLE_I_I]] float16x8_t test_vcvt_high_f16_f32(float16x4_t a, float32x4_t b) { - //CHECK-LABEL: test_vcvt_high_f16_f32 return vcvt_high_f16_f32(a, b); - // CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVT_I:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float> +// CHECK: ret <2 x float> [[VCVT_I]] float32x2_t test_vcvt_f32_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvt_f32_f64 return vcvt_f32_f64(a); - // CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVT_I_I:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float> +// CHECK:
[[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVT_I_I]], <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I_I]] float32x4_t test_vcvt_high_f32_f64(float32x2_t a, float64x2_t b) { - //CHECK-LABEL: test_vcvt_high_f32_f64 return vcvt_high_f32_f64(a, b); - // CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTX_F32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I]]) #2 +// CHECK: ret <2 x float> [[VCVTX_F32_V1_I]] float32x2_t test_vcvtx_f32_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtx_f32_f64 return vcvtx_f32_f64(a); - // CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VCVTX_F32_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I_I]]) #2 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I_I]] float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) { - //CHECK-LABEL: test_vcvtx_high_f32_f64 return vcvtx_high_f32_f64(a, b); - // CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #2 +// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vcvt_f32_f16(float16x4_t a) { - //CHECK-LABEL: test_vcvt_f32_f16 return vcvt_f32_f16(a); - // CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h } +// CHECK-LABEL: define <4 x float> @test_vcvt_high_f32_f16(<8 x half> %a) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]]) #2 +// CHECK: [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vcvt_high_f32_f16(float16x8_t a) { - //CHECK-LABEL: test_vcvt_high_f32_f16 return vcvt_high_f32_f16(a); - // CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h } +// CHECK-LABEL: define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVT_I:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> +// CHECK: ret <2 x double> [[VCVT_I]] float64x2_t test_vcvt_f64_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvt_f64_f32 return vcvt_f64_f32(a); - // CHECK: fcvtl v{{[0-9]+}}.2d, 
v{{[0-9]+}}.2s } +// CHECK-LABEL: define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVT_I_I:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> +// CHECK: ret <2 x double> [[VCVT_I_I]] float64x2_t test_vcvt_high_f64_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvt_high_f64_f32 return vcvt_high_f64_f32(a); - // CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDN1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> [[VRNDN_I]]) #2 +// CHECK: ret <2 x float> [[VRNDN1_I]] float32x2_t test_vrndn_f32(float32x2_t a) { - //CHECK-LABEL: test_vrndn_f32 return vrndn_f32(a); - // CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDN1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> [[VRNDN_I]]) #2 +// CHECK: ret <4 x float> [[VRNDN1_I]] float32x4_t test_vrndnq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndnq_f32 return vrndnq_f32(a); - // CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> [[VRNDN_I]]) #2 +// CHECK: ret <2 x double> [[VRNDN1_I]] float64x2_t test_vrndnq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndnq_f64 return vrndnq_f64(a); - // CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[VRNDA_I]]) #2 +// CHECK: ret <2 x float> [[VRNDA1_I]] float32x2_t test_vrnda_f32(float32x2_t a) { - //CHECK-LABEL: test_vrnda_f32 return vrnda_f32(a); - // CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[VRNDA_I]]) #2 +// CHECK: ret <4 x float> [[VRNDA1_I]] float32x4_t test_vrndaq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndaq_f32 return vrndaq_f32(a); - // CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[VRNDA_I]]) #2 +// CHECK: ret <2 x double> [[VRNDA1_I]] float64x2_t test_vrndaq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndaq_f64 return 
vrndaq_f64(a); - // CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[VRNDP_I]]) #2 +// CHECK: ret <2 x float> [[VRNDP1_I]] float32x2_t test_vrndp_f32(float32x2_t a) { - //CHECK-LABEL: test_vrndp_f32 return vrndp_f32(a); - // CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VRNDP_I]]) #2 +// CHECK: ret <4 x float> [[VRNDP1_I]] float32x4_t test_vrndpq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndpq_f32 return vrndpq_f32(a); - // CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[VRNDP_I]]) #2 +// CHECK: ret <2 x double> [[VRNDP1_I]] float64x2_t test_vrndpq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndpq_f64 return vrndpq_f64(a); - // CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[VRNDM_I]]) #2 +// CHECK: ret <2 x float> [[VRNDM1_I]] float32x2_t test_vrndm_f32(float32x2_t a) { - //CHECK-LABEL: test_vrndm_f32 return vrndm_f32(a); - // CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[VRNDM_I]]) #2 +// CHECK: ret <4 x float> [[VRNDM1_I]] float32x4_t test_vrndmq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndmq_f32 return vrndmq_f32(a); - // CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[VRNDM_I]]) #2 +// CHECK: ret <2 x double> [[VRNDM1_I]] float64x2_t test_vrndmq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndmq_f64 return vrndmq_f64(a); - // CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[VRNDX_I]]) #2 +// CHECK: ret <2 x float> [[VRNDX1_I]] float32x2_t test_vrndx_f32(float32x2_t a) { - //CHECK-LABEL: test_vrndx_f32 return vrndx_f32(a); - // CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> 
@test_vrndxq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[VRNDX_I]]) #2 +// CHECK: ret <4 x float> [[VRNDX1_I]] float32x4_t test_vrndxq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndxq_f32 return vrndxq_f32(a); - // CHECK: frintx v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[VRNDX_I]]) #2 +// CHECK: ret <2 x double> [[VRNDX1_I]] float64x2_t test_vrndxq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndxq_f64 return vrndxq_f64(a); - // CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[VRNDZ_I]]) #2 +// CHECK: ret <2 x float> [[VRNDZ1_I]] float32x2_t test_vrnd_f32(float32x2_t a) { - //CHECK-LABEL: test_vrnd_f32 return vrnd_f32(a); - // CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VRNDZ_I]]) #2 +// CHECK: ret <4 x float> [[VRNDZ1_I]] float32x4_t test_vrndq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndq_f32 return vrndq_f32(a); - // CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[VRNDZ_I]]) #2 +// CHECK: ret <2 x double> [[VRNDZ1_I]] float64x2_t test_vrndq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndq_f64 return vrndq_f64(a); - // CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDI_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[VRNDI_I]]) #2 +// CHECK: ret <2 x float> [[VRNDI1_I]] float32x2_t test_vrndi_f32(float32x2_t a) { - //CHECK-LABEL: test_vrndi_f32 return vrndi_f32(a); - // CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDI_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VRNDI_I]]) #2 +// CHECK: ret <4 x float> [[VRNDI1_I]] float32x4_t test_vrndiq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndiq_f32 return vrndiq_f32(a); - // CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: 
[[VRNDI_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[VRNDI_I]]) #2 +// CHECK: ret <2 x double> [[VRNDI1_I]] float64x2_t test_vrndiq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndiq_f64 return vrndiq_f64(a); - // CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }
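
That completes the rounding tests. As the CHECK lines above show, only vrndn needs a target intrinsic (@llvm.aarch64.neon.frintn); the others map to generic LLVM intrinsics: vrnda to @llvm.round, vrndp to @llvm.ceil, vrndm to @llvm.floor, vrndx to @llvm.rint, vrnd to @llvm.trunc, and vrndi to @llvm.nearbyint. For reference, illustrative only (the helper name is made up and is not part of the patch):

  #include <arm_neon.h>

  // Sketch: the seven AArch64 rounding flavors side by side.
  static inline void rounding_modes(float32x2_t v, float32x2_t r[7]) {
    r[0] = vrndn_f32(v); // frintn: to nearest, ties to even
    r[1] = vrnda_f32(v); // frinta: to nearest, ties away from zero
    r[2] = vrndp_f32(v); // frintp: toward plus infinity (ceil)
    r[3] = vrndm_f32(v); // frintm: toward minus infinity (floor)
    r[4] = vrndx_f32(v); // frintx: current mode, may raise inexact
    r[5] = vrnd_f32(v);  // frintz: toward zero (truncate)
    r[6] = vrndi_f32(v); // frinti: current mode, no inexact exception
  }
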
+// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vcvt_s32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvt_s32_f32 return vcvt_s32_f32(a); - // CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vcvtq_s32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtq_s32_f32 return vcvtq_s32_f32(a); - // CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fptosi <2 x double> [[TMP1]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vcvtq_s64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtq_s64_f64 return vcvtq_s64_f64(a); - // CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vcvt_u32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvt_u32_f32 return vcvt_u32_f32(a); - // CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtq_u32_f32 return vcvtq_u32_f32(a); - // CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vcvtq_u64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtq_u64_f64 return vcvtq_u64_f64(a); - // CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> [[VCVTN_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTN1_I]] int32x2_t test_vcvtn_s32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtn_s32_f32 return vcvtn_s32_f32(a); - // CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> [[VCVTN_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTN1_I]] int32x4_t test_vcvtnq_s32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtnq_s32_f32 return vcvtnq_s32_f32(a); - // CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> [[VCVTN_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTN1_I]] int64x2_t test_vcvtnq_s64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtnq_s64_f64 return vcvtnq_s64_f64(a); - // CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTN1_I]] uint32x2_t test_vcvtn_u32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtn_u32_f32 return vcvtn_u32_f32(a); - // CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> [[VCVTN_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTN1_I]] uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtnq_u32_f32 return vcvtnq_u32_f32(a); - // CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> [[VCVTN_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTN1_I]] uint64x2_t test_vcvtnq_u64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtnq_u64_f64 return vcvtnq_u64_f64(a); - // CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> [[VCVTP_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTP1_I]] int32x2_t test_vcvtp_s32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtp_s32_f32 return vcvtp_s32_f32(a); - // CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> [[VCVTP_I]]) #2 +//
CHECK: ret <4 x i32> [[VCVTP1_I]] int32x4_t test_vcvtpq_s32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtpq_s32_f32 return vcvtpq_s32_f32(a); - // CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> [[VCVTP_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTP1_I]] int64x2_t test_vcvtpq_s64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtpq_s64_f64 return vcvtpq_s64_f64(a); - // CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTP1_I]] uint32x2_t test_vcvtp_u32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtp_u32_f32 return vcvtp_u32_f32(a); - // CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> [[VCVTP_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTP1_I]] uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtpq_u32_f32 return vcvtpq_u32_f32(a); - // CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> [[VCVTP_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTP1_I]] uint64x2_t test_vcvtpq_u64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtpq_u64_f64 return vcvtpq_u64_f64(a); - // CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> [[VCVTM_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTM1_I]] int32x2_t test_vcvtm_s32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtm_s32_f32 return vcvtm_s32_f32(a); - // CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> [[VCVTM_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTM1_I]] int32x4_t test_vcvtmq_s32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtmq_s32_f32 return vcvtmq_s32_f32(a); - // CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTM1_I:%.*]] 
= call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> [[VCVTM_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTM1_I]] int64x2_t test_vcvtmq_s64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtmq_s64_f64 return vcvtmq_s64_f64(a); - // CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTM1_I]] uint32x2_t test_vcvtm_u32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtm_u32_f32 return vcvtm_u32_f32(a); - // CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> [[VCVTM_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTM1_I]] uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtmq_u32_f32 return vcvtmq_u32_f32(a); - // CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> [[VCVTM_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTM1_I]] uint64x2_t test_vcvtmq_u64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtmq_u64_f64 return vcvtmq_u64_f64(a); - // CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> [[VCVTA_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTA1_I]] int32x2_t test_vcvta_s32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvta_s32_f32 return vcvta_s32_f32(a); - // CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> [[VCVTA_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTA1_I]] int32x4_t test_vcvtaq_s32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtaq_s32_f32 return vcvtaq_s32_f32(a); - // CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> [[VCVTA_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTA1_I]] int64x2_t test_vcvtaq_s64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtaq_s64_f64 return vcvtaq_s64_f64(a); - // CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: 
[[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> [[VCVTA_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTA1_I]] uint32x2_t test_vcvta_u32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvta_u32_f32 return vcvta_u32_f32(a); - // CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> [[VCVTA_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTA1_I]] uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtaq_u32_f32 return vcvtaq_u32_f32(a); - // CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> [[VCVTA_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTA1_I]] uint64x2_t test_vcvtaq_u64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtaq_u64_f64 return vcvtaq_u64_f64(a); - // CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #2 +// CHECK: ret <2 x float> [[VRSQRTE_V1_I]] float32x2_t test_vrsqrte_f32(float32x2_t a) { - //CHECK-LABEL: test_vrsqrte_f32 return vrsqrte_f32(a); - // CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #2 +// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]] float32x4_t test_vrsqrteq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrsqrteq_f32 return vrsqrteq_f32(a); - // CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> [[VRSQRTEQ_V_I]]) #2 +// CHECK: ret <2 x double> [[VRSQRTEQ_V1_I]] float64x2_t test_vrsqrteq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrsqrteq_f64 return vrsqrteq_f64(a); - // CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #2 +// CHECK: ret <2 x float> [[VRECPE_V1_I]] float32x2_t test_vrecpe_f32(float32x2_t a) { - //CHECK-LABEL: test_vrecpe_f32 return vrecpe_f32(a); - // CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: 
define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #2 +// CHECK: ret <4 x float> [[VRECPEQ_V1_I]] float32x4_t test_vrecpeq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrecpeq_f32 return vrecpeq_f32(a); - // CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> [[VRECPEQ_V_I]]) #2 +// CHECK: ret <2 x double> [[VRECPEQ_V1_I]] float64x2_t test_vrecpeq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrecpeq_f64 return vrecpeq_f64(a); - // CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #2 +// CHECK: ret <2 x i32> [[VRECPE_V1_I]] uint32x2_t test_vrecpe_u32(uint32x2_t a) { - //CHECK-LABEL: test_vrecpe_u32 return vrecpe_u32(a); - // CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #2 +// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]] uint32x4_t test_vrecpeq_u32(uint32x4_t a) { - //CHECK-LABEL: test_vrecpeq_u32 return vrecpeq_u32(a); - // CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[TMP1]]) #2 +// CHECK: ret <2 x float> [[VSQRT_I]] float32x2_t test_vsqrt_f32(float32x2_t a) { - //CHECK-LABEL: test_vsqrt_f32 return vsqrt_f32(a); - // CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) #2 +// CHECK: ret <4 x float> [[VSQRT_I]] float32x4_t test_vsqrtq_f32(float32x4_t a) { - //CHECK-LABEL: test_vsqrtq_f32 return vsqrtq_f32(a); - // CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) #2 +// CHECK: ret <2 x double> [[VSQRT_I]] float64x2_t test_vsqrtq_f64(float64x2_t a) { - //CHECK-LABEL: test_vsqrtq_f64 return vsqrtq_f64(a); - // CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> 
@test_vcvt_f32_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> +// CHECK: ret <2 x float> [[VCVT_I]] float32x2_t test_vcvt_f32_s32(int32x2_t a) { - //CHECK-LABEL: test_vcvt_f32_s32 return vcvt_f32_s32(a); - //CHECK: scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float> +// CHECK: ret <2 x float> [[VCVT_I]] float32x2_t test_vcvt_f32_u32(uint32x2_t a) { - //CHECK-LABEL: test_vcvt_f32_u32 return vcvt_f32_u32(a); - //CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +// CHECK: ret <4 x float> [[VCVT_I]] float32x4_t test_vcvtq_f32_s32(int32x4_t a) { - //CHECK-LABEL: test_vcvtq_f32_s32 return vcvtq_f32_s32(a); - //CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float> +// CHECK: ret <4 x float> [[VCVT_I]] float32x4_t test_vcvtq_f32_u32(uint32x4_t a) { - //CHECK-LABEL: test_vcvtq_f32_u32 return vcvtq_f32_u32(a); - //CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> +// CHECK: ret <2 x double> [[VCVT_I]] float64x2_t test_vcvtq_f64_s64(int64x2_t a) { - //CHECK-LABEL: test_vcvtq_f64_s64 return vcvtq_f64_s64(a); - //CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double> +// CHECK: ret <2 x double> [[VCVT_I]] float64x2_t test_vcvtq_f64_u64(uint64x2_t a) { - //CHECK-LABEL: test_vcvtq_f64_u64 return vcvtq_f64_u64(a); - //CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } Index: test/CodeGen/aarch64-neon-perm.c =================================================================== --- test/CodeGen/aarch64-neon-perm.c +++ test/CodeGen/aarch64-neon-perm.c @@ -1,1092 +1,2279 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include +// CHECK-LABEL: define <8 x i8> @test_vuzp1_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vuzp1_s8 return vuzp1_s8(a, b); - // CHECK: uzp1 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp1q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vuzp1q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vuzp1q_s8 return vuzp1q_s8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp1_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vuzp1_s16 return vuzp1_s16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp1q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vuzp1q_s16 return vuzp1q_s16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vuzp1_s32 return vuzp1_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vuzp1q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vuzp1q_s32 return vuzp1q_s32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vuzp1q_s64 return vuzp1q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vuzp1_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vuzp1_u8 return vuzp1_u8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp1q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vuzp1q_u8 return vuzp1q_u8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp1_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vuzp1_u16 return vuzp1_u16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp1q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> 
+// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vuzp1q_u16 return vuzp1q_u16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vuzp1_u32 return vuzp1_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vuzp1q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vuzp1q_u32 return vuzp1q_u32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vuzp1q_u64 return vuzp1q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vuzp1_f32 return vuzp1_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vuzp1q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vuzp1q_f32 return vuzp1q_f32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vuzp1q_f64 return vuzp1q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vuzp1_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vuzp1_p8 return vuzp1_p8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp1q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vuzp1q_p8 return vuzp1q_p8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp1_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t 
test_vuzp1_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vuzp1_p16 return vuzp1_p16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp1q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vuzp1q_p16 return vuzp1q_p16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i8> @test_vuzp2_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vuzp2_s8 return vuzp2_s8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp2q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vuzp2q_s8 return vuzp2q_s8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp2_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vuzp2_s16 return vuzp2_s16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp2q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vuzp2q_s16 return vuzp2q_s16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vuzp2_s32 return vuzp2_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vuzp2q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vuzp2q_s32 return vuzp2q_s32(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vuzp2q_s64 return vuzp2q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vuzp2_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vuzp2_u8 return vuzp2_u8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp2q_u8(<16 x i8> 
%a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vuzp2q_u8 return vuzp2q_u8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp2_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vuzp2_u16 return vuzp2_u16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp2q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vuzp2q_u16 return vuzp2q_u16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vuzp2_u32 return vuzp2_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vuzp2q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vuzp2q_u32 return vuzp2q_u32(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vuzp2q_u64 return vuzp2q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vuzp2_f32 return vuzp2_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vuzp2q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vuzp2q_f32 return vuzp2q_f32(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vuzp2q_f64 return vuzp2q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vuzp2_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = 
shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vuzp2_p8 return vuzp2_p8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp2q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vuzp2q_p8 return vuzp2q_p8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp2_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vuzp2_p16 return vuzp2_p16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp2q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vuzp2q_p16 return vuzp2q_p16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i8> @test_vzip1_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vzip1_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vzip1_s8 return vzip1_s8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip1q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vzip1q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vzip1q_s8 return vzip1q_s8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip1_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vzip1_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vzip1_s16 return vzip1_s16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip1q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vzip1q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vzip1q_s16 return vzip1q_s16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vzip1_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vzip1_s32 return vzip1_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vzip1q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vzip1q_s32 return vzip1q_s32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, 
{{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vzip1q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vzip1q_s64 return vzip1q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vzip1_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vzip1_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vzip1_u8 return vzip1_u8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip1q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vzip1q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vzip1q_u8 return vzip1q_u8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip1_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vzip1_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vzip1_u16 return vzip1_u16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip1q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vzip1q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vzip1q_u16 return vzip1q_u16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vzip1_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vzip1_u32 return vzip1_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vzip1q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vzip1q_u32 return vzip1q_u32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vzip1q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vzip1q_u64 return vzip1q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vzip1_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vzip1_f32 return vzip1_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vzip1q_f32(<4 x float> %a, <4 x float> %b) #0 { +// 
CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vzip1q_f32 return vzip1q_f32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vzip1q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vzip1q_f64 return vzip1q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vzip1_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vzip1_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vzip1_p8 return vzip1_p8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip1q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vzip1q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vzip1q_p8 return vzip1q_p8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip1_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vzip1_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vzip1_p16 return vzip1_p16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip1q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vzip1q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vzip1q_p16 return vzip1q_p16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i8> @test_vzip2_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vzip2_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vzip2_s8 return vzip2_s8(a, b); - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip2q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vzip2q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vzip2q_s8 return vzip2q_s8(a, b); - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip2_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vzip2_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vzip2_s16 return vzip2_s16(a, b); - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip2q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vzip2q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: 
test_vzip2q_s16 return vzip2q_s16(a, b); - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vzip2_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vzip2_s32 return vzip2_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vzip2q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vzip2q_s32 return vzip2q_s32(a, b); - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vzip2q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vzip2q_s64 return vzip2q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vzip2_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vzip2_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vzip2_u8 return vzip2_u8(a, b); - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip2q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vzip2q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vzip2q_u8 return vzip2q_u8(a, b); - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip2_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vzip2_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vzip2_u16 return vzip2_u16(a, b); - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip2q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vzip2q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vzip2q_u16 return vzip2q_u16(a, b); - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vzip2_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vzip2_u32 return vzip2_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vzip2q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vzip2q_u32 return vzip2q_u32(a, b); - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x 
i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vzip2q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vzip2q_u64 return vzip2q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vzip2_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vzip2_f32 return vzip2_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vzip2q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vzip2q_f32 return vzip2q_f32(a, b); - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vzip2q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vzip2q_f64 return vzip2q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vzip2_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vzip2_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vzip2_p8 return vzip2_p8(a, b); - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip2q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vzip2q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vzip2q_p8 return vzip2q_p8(a, b); - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip2_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vzip2_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vzip2_p16 return vzip2_p16(a, b); - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip2q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vzip2q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vzip2q_p16 return vzip2q_p16(a, b); - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i8> @test_vtrn1_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vtrn1_s8 return vtrn1_s8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn1q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x 
i8> [[SHUFFLE_I]] int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vtrn1q_s8 return vtrn1q_s8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn1_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vtrn1_s16 return vtrn1_s16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn1q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vtrn1q_s16 return vtrn1q_s16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vtrn1_s32 return vtrn1_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vtrn1q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vtrn1q_s32 return vtrn1q_s32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vtrn1q_s64 return vtrn1q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vtrn1_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtrn1_u8 return vtrn1_u8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn1q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vtrn1q_u8 return vtrn1q_u8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn1_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vtrn1_u16 return vtrn1_u16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn1q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vtrn1q_u16 return vtrn1q_u16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// 
CHECK-LABEL: define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vtrn1_u32 return vtrn1_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vtrn1q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vtrn1q_u32 return vtrn1q_u32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vtrn1q_u64 return vtrn1q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vtrn1_f32 return vtrn1_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vtrn1q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vtrn1q_f32 return vtrn1q_f32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vtrn1q_f64 return vtrn1q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vtrn1_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vtrn1_p8 return vtrn1_p8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn1q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vtrn1q_p8 return vtrn1q_p8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn1_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vtrn1_p16 return vtrn1_p16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn1q_p16(<8 x i16> %a, <8 x i16> 
%b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vtrn1q_p16 return vtrn1q_p16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i8> @test_vtrn2_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vtrn2_s8 return vtrn2_s8(a, b); - // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn2q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vtrn2q_s8 return vtrn2q_s8(a, b); - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn2_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vtrn2_s16 return vtrn2_s16(a, b); - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn2q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vtrn2q_s16 return vtrn2q_s16(a, b); - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vtrn2_s32 return vtrn2_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vtrn2q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vtrn2q_s32 return vtrn2q_s32(a, b); - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vtrn2q_s64 return vtrn2q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vtrn2_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtrn2_u8 return vtrn2_u8(a, b); - // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn2q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: 
test_vtrn2q_u8 return vtrn2q_u8(a, b); - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn2_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vtrn2_u16 return vtrn2_u16(a, b); - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn2q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vtrn2q_u16 return vtrn2q_u16(a, b); - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vtrn2_u32 return vtrn2_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vtrn2q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vtrn2q_u32 return vtrn2q_u32(a, b); - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vtrn2q_u64 return vtrn2q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vtrn2_f32 return vtrn2_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vtrn2q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vtrn2q_f32 return vtrn2q_f32(a, b); - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vtrn2q_f64 return vtrn2q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vtrn2_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vtrn2_p8 return vtrn2_p8(a, b); - // CHECK: trn2 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn2q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vtrn2q_p8 return vtrn2q_p8(a, b); - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn2_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vtrn2_p16 return vtrn2_p16(a, b); - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn2q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vtrn2q_p16 return vtrn2q_p16(a, b); - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP8]] int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vuzp_s8 return vuzp_s8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VUZP_I:%.*]] = 
shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP12]] int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vuzp_s16 return vuzp_s16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP12]] int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vuzp_s32 return vuzp_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} }
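// Illustrative sketch, not part of the generated checks: vuzp de-interleaves its two
// operands, which is why each struct-returning test above is checked as a pair of
// shufflevectors (one even-lane mask, one odd-lane mask) stored into adjacent fields
// of the returned aggregate. Assuming a = {a0, a1} and b = {b0, b1}, vuzp_s32(a, b)
// yields {{a0, b0}, {a1, b1}}. The helper name below is hypothetical and exists only
// to demonstrate that lane layout:
static int32x2x2_t uzp_lane_sketch(int32x2_t a, int32x2_t b) {
  int32x2x2_t r = vuzp_s32(a, b);
  // r.val[0] holds the even lanes {a0, b0}; r.val[1] holds the odd lanes {a1, b1}.
  return r;
}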
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP8]] uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vuzp_u8 return vuzp_u8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP12]] uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vuzp_u16 return vuzp_u16(a, b); - // CHECK: uzp1
{{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP12]] uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vuzp_u32 return vuzp_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} }
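// Sketch of why the checks above look the way they do (hypothetical helper, not a
// CHECK block): the x2 intrinsics return a homogeneous aggregate, so unoptimized
// Clang output materializes it through %struct.*x2_t allocas and an @llvm.memcpy,
// and because this test only runs mem2reg over the IR, those allocas and the memcpy
// survive into the CHECK lines. Consuming one half of the result looks like this:
static uint32x2_t uzp_even_lanes(uint32x2_t a, uint32x2_t b) {
  uint32x2x2_t parts = vuzp_u32(a, b);
  return parts.val[0]; // the even-lane vector matched as [[VUZP_I]] above
}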
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP12]] float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vuzp_f32 return vuzp_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vuzp_p8 return vuzp_p8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i16> [[VUZP1_I]],
<4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP12]] poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vuzp_p16 return vuzp_p16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> +// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> +// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vuzpq_s8 return vuzpq_s8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +// CHECK: store <8 x i16>
[[VUZP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP1]]2 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vuzpq_s16 return vuzpq_s16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP1]]2 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vuzpq_s32 return vuzpq_s32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca 
%struct.uint8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP8]] uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vuzpq_u8 return vuzpq_u8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP1]]2 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vuzpq_u16 return vuzpq_u16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: uzp2 {{v[0-9]+}}.8h, 
{{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP1]]2 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vuzpq_u32 return vuzpq_u32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16 +// 
CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x float>] [[TMP1]]1, [2 x <4 x float>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP1]]2 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vuzpq_f32 return vuzpq_f32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP8]] poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vuzpq_p8 return vuzpq_p8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* 
[[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP1]]2 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vuzpq_p16 return vuzpq_p16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP8]] int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vzip_s8 return vzip_s8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: 
[[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP1]]2 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vzip_s16 return vzip_s16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP1]]1, [2 x <2 x i32>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP1]]2 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vzip_s32 return vzip_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: 
[[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP8]] uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vzip_u8 return vzip_u8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP1]]2 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vzip_u16 return vzip_u16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, 
align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP1]]1, [2 x <2 x i32>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP1]]2 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vzip_u32 return vzip_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = 
extractvalue %struct.float32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x float>] [[TMP1]]1, [2 x <2 x float>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP1]]2 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vzip_f32 return vzip_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vzip_p8 return vzip_p8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* 
[[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP1]]2 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vzip_p16 return vzip_p16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vzipq_s8 return vzipq_s8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> 
[[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP1]]2 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vzipq_s16 return vzipq_s16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP1]]2 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vzipq_s32 return vzipq_s32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> 
+// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP8]] uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vzipq_u8 return vzipq_u8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP1]]2 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vzipq_u16 return vzipq_u16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP1]]2 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vzipq_u32 return vzipq_u32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x float>] [[TMP1]]1, [2 x <4 x 
float>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP1]]2 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vzipq_f32 return vzipq_f32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP8]] poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vzipq_p8 return vzipq_p8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, 
%struct.poly16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP1]]2 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vzipq_p16 return vzipq_p16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP8]] int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vtrn_s8 return vtrn_s8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast 
%struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP1]]2 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vtrn_s16 return vtrn_s16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP1]]1, [2 x <2 x i32>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP1]]2 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vtrn_s32 return vtrn_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x 
i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP8]] uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtrn_u8 return vtrn_u8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP12]] uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vtrn_u16 return vtrn_u16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2
x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP12]] uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vtrn_u32 return vtrn_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret
%struct.float32x2x2_t [[TMP12]] float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vtrn_f32 return vtrn_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vtrn_p8 return vtrn_p8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]],
i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP12]] poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vtrn_p16 return vtrn_p16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vtrnq_s8 return vtrnq_s8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8*
[[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP12]] int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vtrnq_s16 return vtrnq_s16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP12]] int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vtrnq_s32 return vtrnq_s32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +//
CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP8]] uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vtrnq_u8 return vtrnq_u8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP12]] uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vtrnq_u16 return vtrnq_u16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast
<4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP12]] uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vtrnq_u32 return vtrnq_u32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP12]] float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) { -
// CHECK-LABEL: test_vtrnq_f32 return vtrnq_f32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP8]] poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vtrnq_p8 return vtrnq_p8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0 +// 
CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP12]] poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vtrnq_p16 return vtrnq_p16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } Index: test/CodeGen/aarch64-neon-scalar-copy.c =================================================================== --- test/CodeGen/aarch64-neon-scalar-copy.c +++ test/CodeGen/aarch64-neon-scalar-copy.c @@ -1,173 +1,228 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s #include -// CHECK-LABEL: test_vdups_lane_f32 +// CHECK-LABEL: define float @test_vdups_lane_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VDUPS_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: ret float [[VDUPS_LANE]] float32_t test_vdups_lane_f32(float32x2_t a) { return vdups_lane_f32(a, 1); -// CHECK: ret -// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vdupd_lane_f64 +// CHECK-LABEL: define double @test_vdupd_lane_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VDUPD_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: ret double [[VDUPD_LANE]] float64_t test_vdupd_lane_f64(float64x1_t a) { return vdupd_lane_f64(a, 0); -// CHECK: ret -// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0] } -// CHECK-LABEL: test_vdups_laneq_f32 +// CHECK-LABEL: define float @test_vdups_laneq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: ret float [[VGETQ_LANE]] float32_t test_vdups_laneq_f32(float32x4_t a) { return vdups_laneq_f32(a, 3); -// CHECK: ret -// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vdupd_laneq_f64 +// CHECK-LABEL: define double @test_vdupd_laneq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK: ret double [[VGETQ_LANE]] float64_t test_vdupd_laneq_f64(float64x2_t a) { return vdupd_laneq_f64(a, 1); -// CHECK: ret -// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vdupb_lane_s8 +// CHECK-LABEL: define i8 @test_vdupb_lane_s8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] int8_t test_vdupb_lane_s8(int8x8_t a) { return vdupb_lane_s8(a, 7); -// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7] } -// CHECK-LABEL: test_vduph_lane_s16 +// CHECK-LABEL: define i16 @test_vduph_lane_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] int16_t
test_vduph_lane_s16(int16x4_t a) { return vduph_lane_s16(a, 3); -// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vdups_lane_s32 +// CHECK-LABEL: define i32 @test_vdups_lane_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] int32_t test_vdups_lane_s32(int32x2_t a) { return vdups_lane_s32(a, 1); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vdupd_lane_s64 +// CHECK-LABEL: define i64 @test_vdupd_lane_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] int64_t test_vdupd_lane_s64(int64x1_t a) { return vdupd_lane_s64(a, 0); -// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vdupb_lane_u8 +// CHECK-LABEL: define i8 @test_vdupb_lane_u8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] uint8_t test_vdupb_lane_u8(uint8x8_t a) { return vdupb_lane_u8(a, 7); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7] } -// CHECK-LABEL: test_vduph_lane_u16 +// CHECK-LABEL: define i16 @test_vduph_lane_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] uint16_t test_vduph_lane_u16(uint16x4_t a) { return vduph_lane_u16(a, 3); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vdups_lane_u32 +// CHECK-LABEL: define i32 @test_vdups_lane_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] uint32_t test_vdups_lane_u32(uint32x2_t a) { return vdups_lane_u32(a, 1); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vdupd_lane_u64 +// CHECK-LABEL: define i64 @test_vdupd_lane_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] uint64_t test_vdupd_lane_u64(uint64x1_t a) { return vdupd_lane_u64(a, 0); -// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vdupb_laneq_s8 +// CHECK-LABEL: define i8 @test_vdupb_laneq_s8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] int8_t test_vdupb_laneq_s8(int8x16_t a) { return vdupb_laneq_s8(a, 15); -// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15] } -// CHECK-LABEL: test_vduph_laneq_s16 +// CHECK-LABEL: define i16 @test_vduph_laneq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] int16_t test_vduph_laneq_s16(int16x8_t a) { return vduph_laneq_s16(a, 7); -// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7] } -// 
CHECK-LABEL: test_vdups_laneq_s32 +// CHECK-LABEL: define i32 @test_vdups_laneq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] int32_t test_vdups_laneq_s32(int32x4_t a) { return vdups_laneq_s32(a, 3); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vdupd_laneq_s64 +// CHECK-LABEL: define i64 @test_vdupd_laneq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] int64_t test_vdupd_laneq_s64(int64x2_t a) { return vdupd_laneq_s64(a, 1); -// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vdupb_laneq_u8 +// CHECK-LABEL: define i8 @test_vdupb_laneq_u8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] uint8_t test_vdupb_laneq_u8(uint8x16_t a) { return vdupb_laneq_u8(a, 15); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15] } -// CHECK-LABEL: test_vduph_laneq_u16 +// CHECK-LABEL: define i16 @test_vduph_laneq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] uint16_t test_vduph_laneq_u16(uint16x8_t a) { return vduph_laneq_u16(a, 7); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vdups_laneq_u32 +// CHECK-LABEL: define i32 @test_vdups_laneq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] uint32_t test_vdups_laneq_u32(uint32x4_t a) { return vdups_laneq_u32(a, 3); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vdupd_laneq_u64 +// CHECK-LABEL: define i64 @test_vdupd_laneq_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] uint64_t test_vdupd_laneq_u64(uint64x2_t a) { return vdupd_laneq_u64(a, 1); -// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vdupb_lane_p8 +// CHECK-LABEL: define i8 @test_vdupb_lane_p8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] poly8_t test_vdupb_lane_p8(poly8x8_t a) { return vdupb_lane_p8(a, 7); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7] } -// CHECK-LABEL: test_vduph_lane_p16 +// CHECK-LABEL: define i16 @test_vduph_lane_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] poly16_t test_vduph_lane_p16(poly16x4_t a) { return vduph_lane_p16(a, 3); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vdupb_laneq_p8 +// CHECK-LABEL: define i8 
@test_vdupb_laneq_p8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] poly8_t test_vdupb_laneq_p8(poly8x16_t a) { return vdupb_laneq_p8(a, 15); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15] } -// CHECK-LABEL: test_vduph_laneq_p16 +// CHECK-LABEL: define i16 @test_vduph_laneq_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] poly16_t test_vduph_laneq_p16(poly16x8_t a) { return vduph_laneq_p16(a, 7); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7] } Index: test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c =================================================================== --- test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c +++ test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c @@ -1,259 +1,509 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include +// CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: [[MUL:%.*]] = fmul float %a, [[VGET_LANE]] +// CHECK: ret float [[MUL]] float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) { - // CHECK-LABEL: test_vmuls_lane_f32 return vmuls_lane_f32(a, b, 1); - // CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: [[MUL:%.*]] = fmul double %a, [[VGET_LANE]] +// CHECK: ret double [[MUL]] float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) { - // CHECK-LABEL: test_vmuld_lane_f64 return vmuld_lane_f64(a, b, 0); - // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } +// CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]] +// CHECK: ret float [[MUL]] float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) { - // CHECK-LABEL: test_vmuls_laneq_f32 return vmuls_laneq_f32(a, b, 3); - // CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK: [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]] +// CHECK: ret double [[MUL]] float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) { - // CHECK-LABEL: test_vmuld_laneq_f64 return vmuld_laneq_f64(a, b, 1); - // CHECK: fmul {{d[0-9]+}}, 
{{d[0-9]+}}, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP1]] to double +// CHECK: [[TMP3:%.*]] = fmul double [[TMP2]], %b +// CHECK: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double> +// CHECK: ret <1 x double> [[TMP4]] float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) { - // CHECK-LABEL: test_vmul_n_f64 return vmul_n_f64(a, b); - // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } +// CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]]) #2 +// CHECK: ret float [[VMULXS_F32_I]] float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) { -// CHECK-LABEL: test_vmulxs_lane_f32 return vmulxs_lane_f32(a, b, 1); -// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]]) #2 +// CHECK: ret float [[VMULXS_F32_I]] float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) { -// CHECK-LABEL: test_vmulxs_laneq_f32 return vmulxs_laneq_f32(a, b, 3); -// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]]) #2 +// CHECK: ret double [[VMULXD_F64_I]] float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) { -// CHECK-LABEL: test_vmulxd_lane_f64 return vmulxd_lane_f64(a, b, 0); -// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } +// CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]]) #2 +// CHECK: ret double [[VMULXD_F64_I]] float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) { -// CHECK-LABEL: test_vmulxd_laneq_f64 return vmulxd_laneq_f64(a, b, 1); -// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vmulx_lane_f64 +// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: 
[[TMP2:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[VGET_LANE6:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]]) #2 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0 +// CHECK: ret <1 x double> [[VSET_LANE]] float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) { return vmulx_lane_f64(a, b, 0); - // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } -// CHECK-LABEL: test_vmulx_laneq_f64_0 +// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0 +// CHECK: ret <1 x double> [[VSET_LANE]] float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) { return vmulx_laneq_f64(a, b, 0); - // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] } -// CHECK-LABEL: test_vmulx_laneq_f64_1 +// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0 +// CHECK: ret <1 x double> [[VSET_LANE]] float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) { return vmulx_laneq_f64(a, b, 1); - // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vfmas_lane_f32 +// CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a) +// CHECK: ret float [[TMP2]] float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) { return vfmas_lane_f32(a, b, c, 1); - // CHECK: fmla
{{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vfmad_lane_f64 +// CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a) +// CHECK: ret double [[TMP2]] float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) { return vfmad_lane_f64(a, b, c, 0); - // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } -// CHECK-LABEL: test_vfmad_laneq_f64 +// CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %c to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a) +// CHECK: ret double [[TMP2]] float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) { return vfmad_laneq_f64(a, b, c, 1); - // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vfmss_lane_f32 +// CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 { +// CHECK: [[SUB:%.*]] = fsub <2 x float> , %c +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a) +// CHECK: ret float [[TMP2]] float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) { return vfmss_lane_f32(a, b, c, 1); - // CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vfma_lane_f64 +// CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]]) +// CHECK: ret <1 x double> [[FMLA2]] float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) { return vfma_lane_f64(a, b, v, 0); - // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } -// CHECK-LABEL: test_vfms_lane_f64 +// CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <1 x double> , %v +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double>
[[TMP3]], <1 x i32> zeroinitializer +// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]]) +// CHECK: ret <1 x double> [[FMLA2]] float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) { return vfms_lane_f64(a, b, v, 0); - // CHECK: {{fmls|fmsub}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } -// CHECK-LABEL: test_vfma_laneq_f64 +// CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]]) +// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> +// CHECK: ret <1 x double> [[TMP7]] float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { return vfma_laneq_f64(a, b, v, 0); - // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] } -// CHECK-LABEL: test_vfms_laneq_f64 +// CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <2 x double> , %v +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]]) +// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> +// CHECK: ret <1 x double> [[TMP7]] float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { return vfms_laneq_f64(a, b, v, 0); - // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] } -// CHECK-LABEL: test_vqdmullh_lane_s16 +// CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0 +// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0 +// CHECK: ret i32 [[TMP4]] int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) { return vqdmullh_lane_s16(a, b, 3); - // CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9].4h}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vqdmulls_lane_s32 +// CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32>
%b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]]) #2 +// CHECK: ret i64 [[VQDMULLS_S32_I]] int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) { return vqdmulls_lane_s32(a, b, 1); - // CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vqdmullh_laneq_s16 +// CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0 +// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0 +// CHECK: ret i32 [[TMP4]] int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) { return vqdmullh_laneq_s16(a, b, 7); - // CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vqdmulls_laneq_s32 +// CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]]) #2 +// CHECK: ret i64 [[VQDMULLS_S32_I]] int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) { return vqdmulls_laneq_s32(a, b, 3); - // CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vqdmulhh_lane_s16 +// CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0 +// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP4]] int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) { return vqdmulhh_lane_s16(a, b, 3); -// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vqdmulhs_lane_s32 +// CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2 +// CHECK: ret i32 [[VQDMULHS_S32_I]] int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) { return vqdmulhs_lane_s32(a, b, 1); -// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: 
test_vqdmulhh_laneq_s16 +// CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0 +// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP4]] int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) { return vqdmulhh_laneq_s16(a, b, 7); -// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vqdmulhs_laneq_s32 +// CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2 +// CHECK: ret i32 [[VQDMULHS_S32_I]] int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) { return vqdmulhs_laneq_s32(a, b, 3); -// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vqrdmulhh_lane_s16 +// CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0 +// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP4]] int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) { return vqrdmulhh_lane_s16(a, b, 3); -// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vqrdmulhs_lane_s32 +// CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2 +// CHECK: ret i32 [[VQRDMULHS_S32_I]] int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) { return vqrdmulhs_lane_s32(a, b, 1); -// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vqrdmulhh_laneq_s16 +// CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0 +// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP4]] int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) { return vqrdmulhh_laneq_s16(a, b, 7); -// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vqrdmulhs_laneq_s32 +// CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2 +// CHECK: ret i32 [[VQRDMULHS_S32_I]] int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) { return vqrdmulhs_laneq_s32(a, b, 3); -// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vqdmlalh_lane_s16 +// CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0 +// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) +// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]]) +// CHECK: ret i32 [[VQDMLXL1]] int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) { return vqdmlalh_lane_s16(a, b, c, 3); -// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vqdmlals_lane_s32 +// CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) +// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]]) +// CHECK: ret i64 [[VQDMLXL1]] int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) { return vqdmlals_lane_s32(a, b, c, 1); -// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vqdmlalh_laneq_s16 +// CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0 +// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) +// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]]) +// CHECK: ret i32 [[VQDMLXL1]] int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) { return vqdmlalh_laneq_s16(a, 
b, c, 7); -// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vqdmlals_laneq_s32 +// CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) +// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]]) +// CHECK: ret i64 [[VQDMLXL1]] int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) { return vqdmlals_laneq_s32(a, b, c, 3); -// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vqdmlslh_lane_s16 +// CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0 +// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) +// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]]) +// CHECK: ret i32 [[VQDMLXL1]] int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) { return vqdmlslh_lane_s16(a, b, c, 3); -// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vqdmlsls_lane_s32 +// CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) +// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]]) +// CHECK: ret i64 [[VQDMLXL1]] int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) { return vqdmlsls_lane_s32(a, b, c, 1); -// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vqdmlslh_laneq_s16 +// CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0 +// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) +// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]]) +// CHECK: ret i32 [[VQDMLXL1]] int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) { return vqdmlslh_laneq_s16(a, b, c, 7); -// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vqdmlsls_laneq_s32 +// CHECK-LABEL: define i64 
@test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) +// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]]) +// CHECK: ret i64 [[VQDMLXL1]] int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) { return vqdmlsls_laneq_s32(a, b, c, 3); -// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vmulx_lane_f64_0: +// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double> +// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP1]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP5]], i32 0 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]]) #2 +// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0 +// CHECK: ret <1 x double> [[VSET_LANE]] float64x1_t test_vmulx_lane_f64_0() { float64x1_t arg1; float64x1_t arg2; @@ -262,15 +512,24 @@ arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2)); arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3)); result = vmulx_lane_f64(arg1, arg2, 0); -// CHECK: adrp x[[ADDRLO:[0-9]+]] -// CHECK: ldr d0, [x[[ADDRLO]], -// CHECK: adrp x[[ADDRLO:[0-9]+]] -// CHECK: ldr d1, [x[[ADDRLO]], -// CHECK: fmulx d0, d1, d0 return result; } -// CHECK-LABEL: test_vmulx_laneq_f64_2: +// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double> +// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[SHUFFLE_I]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2 +// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0 +// CHECK: ret <1 x double> [[VSET_LANE]] float64x1_t test_vmulx_laneq_f64_2() { float64x1_t arg1; float64x1_t arg2; @@ -281,10 +540,5 @@ arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3)); arg3 = vcombine_f64(arg1, arg2); result = vmulx_laneq_f64(arg1, arg3, 1); -// CHECK: adrp x[[ADDRLO:[0-9]+]] -// CHECK: ldr d0, [x[[ADDRLO]], -// CHECK: adrp x[[ADDRLO:[0-9]+]] -// CHECK: ldr d1, [x[[ADDRLO]], -// CHECK: fmulx d0, d1, d0 return result; }
Index: test/CodeGen/aarch64-neon-shifts.c =================================================================== --- test/CodeGen/aarch64-neon-shifts.c +++ test/CodeGen/aarch64-neon-shifts.c @@ -1,6 +1,5 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -emit-llvm -O1 -o - %s | FileCheck %s +// RUN: -ffp-contract=fast -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s #include <arm_neon.h> @@ -25,19 +24,20 @@ uint8x8_t test_shift_vsra(uint8x8_t a, uint8x8_t b) { // CHECK-LABEL: test_shift_vsra // CHECK: %[[SHR:.*]] = lshr <8 x i8> %b, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5> - // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a + // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]] return vsra_n_u8(a, b, 5); } int8x8_t test_shift_vsra_smax(int8x8_t a, int8x8_t b) { // CHECK-LABEL: test_shift_vsra_smax // CHECK: %[[SHR:.*]] = ashr <8 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> - // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a + // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]] return vsra_n_s8(a, b, 8); } uint8x8_t test_shift_vsra_umax(uint8x8_t a, uint8x8_t b) { // CHECK-LABEL: test_shift_vsra_umax - // CHECK: ret <8 x i8> %a + // CHECK: [[RES:%.*]] = add <8 x i8> %a, zeroinitializer + // CHECK: ret <8 x i8> [[RES]] return vsra_n_u8(a, b, 8); } Index: test/CodeGen/aarch64-neon-tbl.c =================================================================== --- test/CodeGen/aarch64-neon-tbl.c +++ test/CodeGen/aarch64-neon-tbl.c @@ -1,463 +1,1500 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include <arm_neon.h> +// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL11_I]] int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vtbl1_s8 return vtbl1_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL1_I]] int8x8_t test_vqtbl1_s8(int8x16_t a, int8x8_t b) { - // CHECK-LABEL: test_vqtbl1_s8 return vqtbl1_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0 
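A reader aid for the scalar saturating multiply-accumulate checks above: vqdmlalh_lane_s16 and its laneq/sqsub siblings lower to an @llvm.aarch64.neon.sqdmull of the two scalar operands followed by a saturating @llvm.aarch64.neon.sqadd or sqsub, which is exactly the instruction pair the new IR-level CHECK lines pin down. A minimal standalone sketch of the semantics these checks encode, with hypothetical operand values chosen so that no saturation occurs:

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  // vqdmlalh_lane_s16(a, b, v, 3) computes sat32(a + sat32(2 * b * v[3])).
  int16x4_t v = {1, 2, 3, 4};                      // lane 3 holds 4
  int32_t acc = vqdmlalh_lane_s16(100, 5, v, 3);   // 100 + 2*5*4 = 140
  // vqdmlsls_lane_s32 is the subtracting (sqsub) counterpart on 64 bits.
  int32x2_t w = {7, 2};                            // lane 1 holds 2
  int64_t acc2 = vqdmlsls_lane_s32(1000, 3, w, 1); // 1000 - 2*3*2 = 988
  printf("%d %lld\n", acc, (long long)acc2);       // prints "140 988"
  return 0;
}
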
+// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL13_I]] int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) { - // CHECK-LABEL: test_vtbl2_s8 return vtbl2_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL2_I]] int8x8_t test_vqtbl2_s8(int8x16x2_t a, int8x8_t b) { - // CHECK-LABEL: test_vqtbl2_s8 return vqtbl2_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, 
align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL26_I]] int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) { - // CHECK-LABEL: test_vtbl3_s8 return vtbl3_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> 
[[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL3_I]] int8x8_t test_vqtbl3_s8(int8x16x3_t a, int8x8_t b) { - // CHECK-LABEL: test_vqtbl3_s8 return vqtbl3_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL28_I]] int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) { - // CHECK-LABEL: test_vtbl4_s8 return vtbl4_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: 
[[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx6.i, align 16 +// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL4_I]] int8x8_t test_vqtbl4_s8(int8x16x4_t a, int8x8_t b) { - // CHECK-LABEL: test_vqtbl4_s8 return vqtbl4_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL1_I]] int8x16_t test_vqtbl1q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vqtbl1q_s8 return vqtbl1q_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 
0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL2_I]] int8x16_t test_vqtbl2q_s8(int8x16x2_t a, int8x16_t b) { - // CHECK-LABEL: test_vqtbl2q_s8 return vqtbl2q_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL3_I]] int8x16_t test_vqtbl3q_s8(int8x16x3_t a, int8x16_t b) { - // CHECK-LABEL: test_vqtbl3q_s8 return vqtbl3q_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <16 
x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx6.i, align 16 +// CHECK: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL4_I]] int8x16_t test_vqtbl4q_s8(int8x16x4_t a, int8x16_t b) { - // CHECK-LABEL: test_vqtbl4q_s8 return vqtbl4q_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2 +// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a +// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], +// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK: ret <8 x i8> [[VTBX_I]] int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) { - // CHECK-LABEL: test_vtbx1_s8 return vtbx1_s8(a, b, c); - // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0 - // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b - // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* 
[[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX13_I]] int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) { - // CHECK-LABEL: test_vtbx2_s8 return vtbx2_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2 +// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a +// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], +// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x 
i8> [[TMP6]], [[TMP8]] +// CHECK: ret <8 x i8> [[VTBX_I]] int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) { - // CHECK-LABEL: test_vtbx3_s8 return vtbx3_s8(a, b, c); - // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0 - // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b - // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8 +// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX28_I]] int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) { - // CHECK-LABEL: test_vtbx4_s8 return vtbx4_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX1_I]] int8x8_t test_vqtbx1_s8(int8x8_t a, int8x16_t b, int8x8_t c) { - // CHECK-LABEL: test_vqtbx1_s8 return vqtbx1_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, 
{{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX2_I]] int8x8_t test_vqtbx2_s8(int8x8_t a, int8x16x2_t b, int8x8_t c) { - // CHECK-LABEL: test_vqtbx2_s8 return vqtbx2_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 
x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX3_I]] int8x8_t test_vqtbx3_s8(int8x8_t a, int8x16x3_t b, int8x8_t c) { - // CHECK-LABEL: test_vqtbx3_s8 return vqtbx3_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX4_I]] int8x8_t test_vqtbx4_s8(int8x8_t a, int8x16x4_t b, int8x8_t c) { - // CHECK-LABEL: test_vqtbx4_s8 return vqtbx4_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX1_I]] int8x16_t test_vqtbx1q_s8(int8x16_t a, int8x16_t b, int8x16_t c) { - // CHECK-LABEL: test_vqtbx1q_s8 return vqtbx1q_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, 
{{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX2_I]] int8x16_t test_vqtbx2q_s8(int8x16_t a, int8x16x2_t b, int8x16_t c) { - // CHECK-LABEL: test_vqtbx2q_s8 return vqtbx2q_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = 
getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX3_I]] int8x16_t test_vqtbx3q_s8(int8x16_t a, int8x16x3_t b, int8x16_t c) { - // CHECK-LABEL: test_vqtbx3q_s8 return vqtbx3q_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX4_I]] int8x16_t test_vqtbx4q_s8(int8x16_t a, int8x16x4_t b, int8x16_t c) { - // CHECK-LABEL: test_vqtbx4q_s8 return vqtbx4q_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL11_I]] uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) { - // 
CHECK-LABEL: test_vtbl1_u8 return vtbl1_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL1_I]] uint8x8_t test_vqtbl1_u8(uint8x16_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl1_u8 return vqtbl1_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL13_I]] uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl2_u8 return vtbl2_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] %a.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2
+// CHECK: ret <8 x i8> [[VTBL2_I]]
uint8x8_t test_vqtbl2_u8(uint8x16x2_t a, uint8x8_t b) {
- // CHECK-LABEL: test_vqtbl2_u8
  return vqtbl2_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %a.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2
+// CHECK: ret <8 x i8> [[VTBL26_I]]
uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
- // CHECK-LABEL: test_vtbl3_u8
  return vtbl3_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[A:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %a.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+//
CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL3_I]] uint8x8_t test_vqtbl3_u8(uint8x16x3_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl3_u8 return vqtbl3_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = 
getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL28_I]] uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl4_u8 return vtbl4_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx6.i, align 16 +// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL4_I]] uint8x8_t test_vqtbl4_u8(uint8x16x4_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl4_u8 return vqtbl4_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL1_I]] uint8x16_t test_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: 
test_vqtbl1q_u8 return vqtbl1q_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL2_I]] uint8x16_t test_vqtbl2q_u8(uint8x16x2_t a, uint8x16_t b) { - // CHECK-LABEL: test_vqtbl2q_u8 return vqtbl2q_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* 
[[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2
+// CHECK: ret <16 x i8> [[VTBL3_I]]
uint8x16_t test_vqtbl3q_u8(uint8x16x3_t a, uint8x16_t b) {
- // CHECK-LABEL: test_vqtbl3q_u8
  return vqtbl3q_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[A:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] %a.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2
+// CHECK: ret <16 x i8> [[VTBL4_I]]
uint8x16_t test_vqtbl4q_u8(uint8x16x4_t a, uint8x16_t b) {
- // CHECK-LABEL: test_vqtbl4q_u8
  return vqtbl4q_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
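+// Editor's note: the vqtbl* forms take full 16-byte tables, so the IR above
+// passes the loaded registers straight to the tbl2/tbl3/tbl4 intrinsics with
+// no shufflevector padding. A hedged usage sketch (variable names are
+// illustrative, not part of the test):
+//   uint8x16x4_t lut = { { t0, t1, t2, t3 } };  /* 64-byte lookup table */
+//   uint8x16_t r = vqtbl4q_u8(lut, idx);        /* lanes with idx >= 64 become 0 */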
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2
+// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
+// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK: ret <8 x i8> [[VTBX_I]]
uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vtbx1_u8
  return vtbx1_u8(a, b, c);
- // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
- // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
- // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
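+// Editor's note: vtbx1 cannot use a plain TBX on a zero-padded 16-byte table
+// (indices 8-15 would read the zero half instead of leaving %a intact), so
+// the IR selects per lane: (c >= 8) ? a : tbl(c), via the sext/and/xor/or
+// sequence checked above. A scalar sketch of one lane (illustrative only;
+// the helper is hypothetical):
+//   static inline uint8_t vtbx1_lane(uint8_t a, const uint8_t tab[8], uint8_t idx) {
+//     return idx < 8 ? tab[idx] : a;  /* out-of-range lanes keep the accumulator */
+//   }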
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2
+// CHECK: ret <8 x i8> [[VTBX13_I]]
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vtbx2_u8
  return vtbx2_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2
+// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
+// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK: ret <8 x i8> [[VTBX_I]]
uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vtbx3_u8
  return vtbx3_u8(a, b, c);
- // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
- // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
- // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
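+// Editor's note: vtbx3 follows the same compare-and-select pattern with a
+// 24-byte table (three d registers): the third half is zero-padded for tbl2,
+// and the icmp threshold is 24 so that lanes with c >= 24 keep %a rather
+// than reading the zero padding.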
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2
+// CHECK: ret <8 x i8> [[VTBX28_I]]
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vtbx4_u8
  return vtbx4_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 {
+// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2
+// CHECK: ret <8 x i8> [[VTBX1_I]]
uint8x8_t test_vqtbx1_u8(uint8x8_t a, uint8x16_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vqtbx1_u8
  return vqtbx1_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2
+// CHECK: ret <8 x i8> [[VTBX2_I]]
uint8x8_t test_vqtbx2_u8(uint8x8_t a, uint8x16x2_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vqtbx2_u8
  return vqtbx2_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK: [[__P1_I:%.*]] = alloca
%struct.uint8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX3_I]] uint8x8_t test_vqtbx3_u8(uint8x8_t a, uint8x16x3_t b, uint8x8_t c) { - // CHECK-LABEL: test_vqtbx3_u8 return vqtbx3_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 
x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX4_I]] uint8x8_t test_vqtbx4_u8(uint8x8_t a, uint8x16x4_t b, uint8x8_t c) { - // CHECK-LABEL: test_vqtbx4_u8 return vqtbx4_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX1_I]] uint8x16_t test_vqtbx1q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx1q_u8 return vqtbx1q_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX2_I]] uint8x16_t test_vqtbx2q_u8(uint8x16_t a, uint8x16x2_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx2q_u8 return vqtbx2q_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_u8(<16 x i8> %a, [3 x 
<16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX3_I]] uint8x16_t test_vqtbx3q_u8(uint8x16_t a, uint8x16x3_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx3q_u8 return vqtbx3q_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x 
i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX4_I]] uint8x16_t test_vqtbx4q_u8(uint8x16_t a, uint8x16x4_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx4q_u8 return vqtbx4q_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL11_I]] poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl1_p8 return vtbl1_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL1_I]] poly8x8_t test_vqtbl1_p8(poly8x16_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl1_p8 return vqtbl1_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* 
[[A]]rrayidx2.i, align 8 +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL13_I]] poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl2_p8 return vtbl2_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL2_I]] poly8x8_t test_vqtbl2_p8(poly8x16x2_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl2_p8 return vqtbl2_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// 
CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL26_I]] poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl3_p8 return vtbl3_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL3_I]] poly8x8_t test_vqtbl3_p8(poly8x16x3_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl3_p8 return vqtbl3_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, 
%struct.poly8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL28_I]] poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl4_p8 return vtbl4_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds 
%struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx6.i, align 16 +// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL4_I]] poly8x8_t test_vqtbl4_p8(poly8x16x4_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl4_p8 return vqtbl4_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL1_I]] poly8x16_t test_vqtbl1q_p8(poly8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vqtbl1q_p8 return vqtbl1q_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL2_I]] poly8x16_t test_vqtbl2q_p8(poly8x16x2_t a, uint8x16_t b) { - // CHECK-LABEL: test_vqtbl2q_p8 return vqtbl2q_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ 
?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL3_I]] poly8x16_t test_vqtbl3q_p8(poly8x16x3_t a, uint8x16_t b) { - // CHECK-LABEL: test_vqtbl3q_p8 return vqtbl3q_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// 
CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL4_I]] poly8x16_t test_vqtbl4q_p8(poly8x16x4_t a, uint8x16_t b) { - // CHECK-LABEL: test_vqtbl4q_p8 return vqtbl4q_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2 +// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a +// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], +// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK: ret <8 x i8> [[VTBX_I]] poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) { - // CHECK-LABEL: test_vtbx1_p8 return vtbx1_p8(a, b, c); - // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0 - // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b - // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds 
%struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX13_I]] poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) { - // CHECK-LABEL: test_vtbx2_p8 return vtbx2_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2 +// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a +// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], +// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]] +// CHECK: ret <8 x i8> [[VTBX_I]] poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) { - // CHECK-LABEL: test_vtbx3_p8 return vtbx3_p8(a, b, c); - // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0 - // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b - // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: bsl 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8 +// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX28_I]] poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) { - // CHECK-LABEL: test_vtbx4_p8 return vtbx4_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX1_I]] poly8x8_t test_vqtbx1_p8(poly8x8_t a, uint8x16_t b, uint8x8_t c) { - // CHECK-LABEL: test_vqtbx1_p8 return vqtbx1_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x 
<16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX2_I]] poly8x8_t test_vqtbx2_p8(poly8x8_t a, poly8x16x2_t b, uint8x8_t c) { - // CHECK-LABEL: test_vqtbx2_p8 return vqtbx2_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX3_I]] poly8x8_t test_vqtbx3_p8(poly8x8_t a, poly8x16x3_t b, uint8x8_t c) 
{ - // CHECK-LABEL: test_vqtbx3_p8 return vqtbx3_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX4_I]] poly8x8_t test_vqtbx4_p8(poly8x8_t a, poly8x16x4_t b, uint8x8_t c) { - // CHECK-LABEL: test_vqtbx4_p8 return vqtbx4_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX1_I]] poly8x16_t test_vqtbx1q_p8(poly8x16_t a, uint8x16_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx1q_p8 return vqtbx1q_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, 
i32 0 +// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX2_I]] poly8x16_t test_vqtbx2q_p8(poly8x16_t a, poly8x16x2_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx2q_p8 return vqtbx2q_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2 +// CHECK: ret <16 x 
i8> [[VTBX3_I]] poly8x16_t test_vqtbx3q_p8(poly8x16_t a, poly8x16x3_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx3q_p8 return vqtbx3q_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX4_I]] poly8x16_t test_vqtbx4q_p8(poly8x16_t a, poly8x16x4_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx4q_p8 return vqtbx4q_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } Index: test/CodeGen/aarch64-neon-vcombine.c =================================================================== --- test/CodeGen/aarch64-neon-vcombine.c +++ test/CodeGen/aarch64-neon-vcombine.c @@ -1,90 +1,103 @@ -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -fallow-half-arguments-and-returns -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include +// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %low, <8 x i8> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> 
%low, <8 x i8> %high, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vcombine_s8(int8x8_t low, int8x8_t high) { - // CHECK-LABEL: test_vcombine_s8: return vcombine_s8(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %low, <4 x i16> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vcombine_s16(int16x4_t low, int16x4_t high) { - // CHECK-LABEL: test_vcombine_s16: return vcombine_s16(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %low, <2 x i32> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vcombine_s32(int32x2_t low, int32x2_t high) { - // CHECK-LABEL: test_vcombine_s32: return vcombine_s32(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %low, <1 x i64> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vcombine_s64(int64x1_t low, int64x1_t high) { - // CHECK-LABEL: test_vcombine_s64: return vcombine_s64(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %low, <8 x i8> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vcombine_u8(uint8x8_t low, uint8x8_t high) { - // CHECK-LABEL: test_vcombine_u8: return vcombine_u8(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %low, <4 x i16> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vcombine_u16(uint16x4_t low, uint16x4_t high) { - // CHECK-LABEL: test_vcombine_u16: return vcombine_u16(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %low, <2 x i32> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vcombine_u32(uint32x2_t low, uint32x2_t high) { - // CHECK-LABEL: test_vcombine_u32: return vcombine_u32(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %low, <1 x i64> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vcombine_u64(uint64x1_t low, uint64x1_t high) { - // CHECK-LABEL: test_vcombine_u64: return vcombine_u64(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) { - // CHECK-LABEL: test_vcombine_p64: return vcombine_p64(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %low, <4 x half> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %low, <4 x half> %high, <8 x i32> +// CHECK: ret <8 x half> [[SHUFFLE_I]] float16x8_t 
test_vcombine_f16(float16x4_t low, float16x4_t high) { - // CHECK-LABEL: test_vcombine_f16: return vcombine_f16(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %low, <2 x float> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %low, <2 x float> %high, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vcombine_f32(float32x2_t low, float32x2_t high) { - // CHECK-LABEL: test_vcombine_f32: return vcombine_f32(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %low, <8 x i8> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vcombine_p8(poly8x8_t low, poly8x8_t high) { - // CHECK-LABEL: test_vcombine_p8: return vcombine_p8(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %low, <4 x i16> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vcombine_p16(poly16x4_t low, poly16x4_t high) { - // CHECK-LABEL: test_vcombine_p16: return vcombine_p16(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <2 x double> @test_vcombine_f64(<1 x double> %low, <1 x double> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> %low, <1 x double> %high, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vcombine_f64(float64x1_t low, float64x1_t high) { - // CHECK-LABEL: test_vcombine_f64: return vcombine_f64(low, high); - // CHECK: ins v0.d[1], v1.d[0] } Index: test/CodeGen/aarch64-neon-vget-hilo.c =================================================================== --- test/CodeGen/aarch64-neon-vget-hilo.c +++ test/CodeGen/aarch64-neon-vget-hilo.c @@ -1,176 +1,203 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix CHECK-COMMON --check-prefix CHECK-ARM64 - +// RUN: -fallow-half-arguments-and-returns -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include +// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vget_high_s8(int8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_high_s8: return vget_high_s8(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vget_high_s16(int16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_high_s16: return vget_high_s16(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_high_s32(int32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_high_s32: return vget_high_s32(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x 
i32> +// CHECK: ret <1 x i64> [[SHUFFLE_I]] int64x1_t test_vget_high_s64(int64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_high_s64: return vget_high_s64(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_high_u8(uint8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_high_u8: return vget_high_u8(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vget_high_u16(uint16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_high_u16: return vget_high_u16(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vget_high_u32(uint32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_high_u32: return vget_high_u32(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> +// CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_high_u64(uint64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_high_u64: return vget_high_u64(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <1 x i64> @test_vget_high_p64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> +// CHECK: ret <1 x i64> [[SHUFFLE_I]] poly64x1_t test_vget_high_p64(poly64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_high_p64: return vget_high_p64(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> +// CHECK: ret <4 x half> [[SHUFFLE_I]] float16x4_t test_vget_high_f16(float16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_high_f16: return vget_high_f16(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vget_high_f32(float32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_high_f32: return vget_high_f32(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vget_high_p8(poly8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_high_p8: return vget_high_p8(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vget_high_p16(poly16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_high_p16 return vget_high_p16(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <1 x double> @test_vget_high_f64(<2 x double> %a) #0 { +// 
CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> +// CHECK: ret <1 x double> [[SHUFFLE_I]] float64x1_t test_vget_high_f64(float64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_high_f64 return vget_high_f64(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vget_low_s8(int8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_low_s8: return vget_low_s8(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vget_low_s16(int16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_low_s16: return vget_low_s16(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_low_s32(int32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_low_s32: return vget_low_s32(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE_I]] int64x1_t test_vget_low_s64(int64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_low_s64: return vget_low_s64(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_low_u8(uint8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_low_u8: return vget_low_u8(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vget_low_u16(uint16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_low_u16: return vget_low_u16(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vget_low_u32(uint32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_low_u32: return vget_low_u32(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_low_u64(uint64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_low_u64: return vget_low_u64(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <1 x i64> @test_vget_low_p64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE_I]] poly64x1_t test_vget_low_p64(poly64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_low_p64: return vget_low_p64(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> +// CHECK: ret <4 x half> [[SHUFFLE_I]] 
float16x4_t test_vget_low_f16(float16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_low_f16: return vget_low_f16(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vget_low_f32(float32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_low_f32: return vget_low_f32(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vget_low_p8(poly8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_low_p8: return vget_low_p8(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vget_low_p16(poly16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_low_p16: return vget_low_p16(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <1 x double> @test_vget_low_f64(<2 x double> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x double> [[SHUFFLE_I]] float64x1_t test_vget_low_f64(float64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_low_f64: return vget_low_f64(a); - // CHECK-COMMON-NEXT: ret } Index: test/CodeGen/aarch64-neon-vget.c =================================================================== --- test/CodeGen/aarch64-neon-vget.c +++ test/CodeGen/aarch64-neon-vget.c @@ -1,348 +1,458 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-apple-darwin -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -fallow-half-arguments-and-returns -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg | FileCheck %s #include +// CHECK-LABEL: define i8 @test_vget_lane_u8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] uint8_t test_vget_lane_u8(uint8x8_t a) { - // CHECK-LABEL: test_vget_lane_u8: - // CHECK-NEXT: umov.b w0, v0[7] - // CHECK-NEXT: ret return vget_lane_u8(a, 7); } +// CHECK-LABEL: define i16 @test_vget_lane_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] uint16_t test_vget_lane_u16(uint16x4_t a) { - // CHECK-LABEL: test_vget_lane_u16: - // CHECK-NEXT: umov.h w0, v0[3] - // CHECK-NEXT: ret return vget_lane_u16(a, 3); } +// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] uint32_t test_vget_lane_u32(uint32x2_t a) { - // CHECK-LABEL: test_vget_lane_u32: - // CHECK-NEXT: mov.s w0, v0[1] - // CHECK-NEXT: ret return vget_lane_u32(a, 1); } +// CHECK-LABEL: define i8 @test_vget_lane_s8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] int8_t test_vget_lane_s8(int8x8_t a) { - // CHECK-LABEL: test_vget_lane_s8: - // CHECK-NEXT: umov.b w0, v0[7] - // CHECK-NEXT: 
ret return vget_lane_s8(a, 7); } +// CHECK-LABEL: define i16 @test_vget_lane_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] int16_t test_vget_lane_s16(int16x4_t a) { - // CHECK-LABEL: test_vget_lane_s16: - // CHECK-NEXT: umov.h w0, v0[3] - // CHECK-NEXT: ret return vget_lane_s16(a, 3); } +// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] int32_t test_vget_lane_s32(int32x2_t a) { - // CHECK-LABEL: test_vget_lane_s32: - // CHECK-NEXT: mov.s w0, v0[1] - // CHECK-NEXT: ret return vget_lane_s32(a, 1); } +// CHECK-LABEL: define i8 @test_vget_lane_p8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] poly8_t test_vget_lane_p8(poly8x8_t a) { - // CHECK-LABEL: test_vget_lane_p8: - // CHECK-NEXT: umov.b w0, v0[7] - // CHECK-NEXT: ret return vget_lane_p8(a, 7); } +// CHECK-LABEL: define i16 @test_vget_lane_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] poly16_t test_vget_lane_p16(poly16x4_t a) { - // CHECK-LABEL: test_vget_lane_p16: - // CHECK-NEXT: umov.h w0, v0[3] - // CHECK-NEXT: ret return vget_lane_p16(a, 3); } +// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: ret float [[VGET_LANE]] float32_t test_vget_lane_f32(float32x2_t a) { - // CHECK-LABEL: test_vget_lane_f32: - // CHECK-NEXT: mov s0, v0[1] - // CHECK-NEXT: ret return vget_lane_f32(a, 1); } +// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 { +// CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8 +// CHECK: [[__REINT1_242:%.*]] = alloca i16, align 2 +// CHECK: store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>* +// CHECK: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 +// CHECK: store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2 +// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half* +// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2 +// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float +// CHECK: ret float [[CONV]] float32_t test_vget_lane_f16(float16x4_t a) { - // CHECK-LABEL: test_vget_lane_f16: - // CHECK-NEXT: umov.h w8, v0[1] - // CHECK-NEXT: fmov s0, w8 - // CHECK-NEXT: fcvt s0, h0 - // CHECK-NEXT: ret return vget_lane_f16(a, 1); } +// CHECK-LABEL: define i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] uint8_t test_vgetq_lane_u8(uint8x16_t a) { - // CHECK-LABEL: 
test_vgetq_lane_u8: - // CHECK-NEXT: umov.b w0, v0[15] - // CHECK-NEXT: ret return vgetq_lane_u8(a, 15); } +// CHECK-LABEL: define i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] uint16_t test_vgetq_lane_u16(uint16x8_t a) { - // CHECK-LABEL: test_vgetq_lane_u16: - // CHECK-NEXT: umov.h w0, v0[7] - // CHECK-NEXT: ret return vgetq_lane_u16(a, 7); } +// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] uint32_t test_vgetq_lane_u32(uint32x4_t a) { - // CHECK-LABEL: test_vgetq_lane_u32: - // CHECK-NEXT: mov.s w0, v0[3] - // CHECK-NEXT: ret return vgetq_lane_u32(a, 3); } +// CHECK-LABEL: define i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] int8_t test_vgetq_lane_s8(int8x16_t a) { - // CHECK-LABEL: test_vgetq_lane_s8: - // CHECK-NEXT: umov.b w0, v0[15] - // CHECK-NEXT: ret return vgetq_lane_s8(a, 15); } +// CHECK-LABEL: define i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] int16_t test_vgetq_lane_s16(int16x8_t a) { - // CHECK-LABEL: test_vgetq_lane_s16: - // CHECK-NEXT: umov.h w0, v0[7] - // CHECK-NEXT: ret return vgetq_lane_s16(a, 7); } +// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] int32_t test_vgetq_lane_s32(int32x4_t a) { - // CHECK-LABEL: test_vgetq_lane_s32: - // CHECK-NEXT: mov.s w0, v0[3] - // CHECK-NEXT: ret return vgetq_lane_s32(a, 3); } +// CHECK-LABEL: define i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] poly8_t test_vgetq_lane_p8(poly8x16_t a) { - // CHECK-LABEL: test_vgetq_lane_p8: - // CHECK-NEXT: umov.b w0, v0[15] - // CHECK-NEXT: ret return vgetq_lane_p8(a, 15); } +// CHECK-LABEL: define i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] poly16_t test_vgetq_lane_p16(poly16x8_t a) { - // CHECK-LABEL: test_vgetq_lane_p16: - // CHECK-NEXT: umov.h w0, v0[7] - // CHECK-NEXT: ret return vgetq_lane_p16(a, 7); } +// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: ret float [[VGETQ_LANE]] float32_t test_vgetq_lane_f32(float32x4_t a) { - // CHECK-LABEL: test_vgetq_lane_f32: - // CHECK-NEXT: mov s0, v0[3] 
- // CHECK-NEXT: ret return vgetq_lane_f32(a, 3); } +// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 { +// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16 +// CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2 +// CHECK: store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>* +// CHECK: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +// CHECK: store i16 [[VGETQ_LANE]], i16* [[__REINT1_244]], align 2 +// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half* +// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2 +// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float +// CHECK: ret float [[CONV]] float32_t test_vgetq_lane_f16(float16x8_t a) { - // CHECK-LABEL: test_vgetq_lane_f16: - // CHECK-NEXT: umov.h w8, v0[3] - // CHECK-NEXT: fmov s0, w8 - // CHECK-NEXT: fcvt s0, h0 - // CHECK-NEXT: ret return vgetq_lane_f16(a, 3); } +// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] int64_t test_vget_lane_s64(int64x1_t a) { - // CHECK-LABEL: test_vget_lane_s64: - // CHECK-NEXT: fmov x0, d0 - // CHECK-NEXT: ret return vget_lane_s64(a, 0); } +// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] uint64_t test_vget_lane_u64(uint64x1_t a) { - // CHECK-LABEL: test_vget_lane_u64: - // CHECK-NEXT: fmov x0, d0 - // CHECK-NEXT: ret return vget_lane_u64(a, 0); } +// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] int64_t test_vgetq_lane_s64(int64x2_t a) { - // CHECK-LABEL: test_vgetq_lane_s64: - // CHECK-NEXT: mov.d x0, v0[1] - // CHECK-NEXT: ret return vgetq_lane_s64(a, 1); } +// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] uint64_t test_vgetq_lane_u64(uint64x2_t a) { - // CHECK-LABEL: test_vgetq_lane_u64: - // CHECK-NEXT: mov.d x0, v0[1] - // CHECK-NEXT: ret return vgetq_lane_u64(a, 1); } +// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vset_lane_u8: - // CHECK-NEXT: ins.b v0[7], w0 - // CHECK-NEXT: ret return vset_lane_u8(a, b, 7); } +// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = 
bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) { - // CHECK-LABEL: test_vset_lane_u16: - // CHECK-NEXT: ins.h v0[3], w0 - // CHECK-NEXT: ret return vset_lane_u16(a, b, 3); } +// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VSET_LANE]] uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) { - // CHECK-LABEL: test_vset_lane_u32: - // CHECK-NEXT: ins.s v0[1], w0 - // CHECK-NEXT: ret return vset_lane_u32(a, b, 1); } +// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) { - // CHECK-LABEL: test_vset_lane_s8: - // CHECK-NEXT: ins.b v0[7], w0 - // CHECK-NEXT: ret return vset_lane_s8(a, b, 7); } +// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) { - // CHECK-LABEL: test_vset_lane_s16: - // CHECK-NEXT: ins.h v0[3], w0 - // CHECK-NEXT: ret return vset_lane_s16(a, b, 3); } +// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VSET_LANE]] int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) { - // CHECK-LABEL: test_vset_lane_s32: - // CHECK-NEXT: ins.s v0[1], w0 - // CHECK-NEXT: ret return vset_lane_s32(a, b, 1); } +// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vset_lane_p8: - // CHECK-NEXT: ins.b v0[7], w0 - // CHECK-NEXT: ret return vset_lane_p8(a, b, 7); } +// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) { - // CHECK-LABEL: test_vset_lane_p16: - // CHECK-NEXT: ins.h v0[3], w0 - // CHECK-NEXT: ret return vset_lane_p16(a, b, 3); } +// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1 +// CHECK: ret <2 x float> [[VSET_LANE]] float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) { - // CHECK-LABEL: test_vset_lane_f32: - // CHECK-NEXT: ins.s v1[1], v0[0] 
- // CHECK-NEXT: mov.16b v0, v1 - // CHECK-NEXT: ret return vset_lane_f32(a, b, 1); } +// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[__REINT_246:%.*]] = alloca half, align 2 +// CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8 +// CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8 +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: store half [[TMP0]], half* [[__REINT_246]], align 2 +// CHECK: store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8 +// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK: [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 3 +// CHECK: store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>* +// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8 +// CHECK: ret <4 x half> [[TMP8]] float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) { - // CHECK-LABEL: test_vset_lane_f16: - // CHECK-NEXT: ld1.h { v0 }[3], [x0] - // CHECK-NEXT: ret return vset_lane_f16(*a, b, 3); } +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) { - // CHECK-LABEL: test_vsetq_lane_u8: - // CHECK-NEXT: ins.b v0[15], w0 - // CHECK-NEXT: ret return vsetq_lane_u8(a, b, 15); } +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VSET_LANE]] uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) { - // CHECK-LABEL: test_vsetq_lane_u16: - // CHECK-NEXT: ins.h v0[7], w0 - // CHECK-NEXT: ret return vsetq_lane_u16(a, b, 7); } +// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) { - // CHECK-LABEL: test_vsetq_lane_u32: - // CHECK-NEXT: ins.s v0[3], w0 - // CHECK-NEXT: ret return vsetq_lane_u32(a, b, 3); } +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) { - // CHECK-LABEL: test_vsetq_lane_s8: - // CHECK-NEXT: ins.b v0[15], w0 - // CHECK-NEXT: ret return vsetq_lane_s8(a, b, 15); } +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret 
<8 x i16> [[VSET_LANE]] int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) { - // CHECK-LABEL: test_vsetq_lane_s16: - // CHECK-NEXT: ins.h v0[7], w0 - // CHECK-NEXT: ret return vsetq_lane_s16(a, b, 7); } +// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) { - // CHECK-LABEL: test_vsetq_lane_s32: - // CHECK-NEXT: ins.s v0[3], w0 - // CHECK-NEXT: ret return vsetq_lane_s32(a, b, 3); } +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) { - // CHECK-LABEL: test_vsetq_lane_p8: - // CHECK-NEXT: ins.b v0[15], w0 - // CHECK-NEXT: ret return vsetq_lane_p8(a, b, 15); } +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VSET_LANE]] poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) { - // CHECK-LABEL: test_vsetq_lane_p16: - // CHECK-NEXT: ins.h v0[7], w0 - // CHECK-NEXT: ret return vsetq_lane_p16(a, b, 7); } +// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3 +// CHECK: ret <4 x float> [[VSET_LANE]] float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) { - // CHECK-LABEL: test_vsetq_lane_f32: - // CHECK-NEXT: ins.s v1[3], v0[0] - // CHECK-NEXT: mov.16b v0, v1 - // CHECK-NEXT: ret return vsetq_lane_f32(a, b, 3); } +// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[__REINT_248:%.*]] = alloca half, align 2 +// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16 +// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16 +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: store half [[TMP0]], half* [[__REINT_248]], align 2 +// CHECK: store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16 +// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK: [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 7 +// CHECK: store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>* +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16 +// CHECK: ret <8 x half> [[TMP8]] float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) { - // CHECK-LABEL: test_vsetq_lane_f16: - // CHECK-NEXT: ld1.h { v0 }[7], [x0] - // CHECK-NEXT: ret return 
vsetq_lane_f16(*a, b, 7); } +// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) { - // CHECK-LABEL: test_vset_lane_s64: - // CHECK-NEXT: fmov d0, x0 - // CHECK-NEXT: ret return vset_lane_s64(a, b, 0); } +// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) { - // CHECK-LABEL: test_vset_lane_u64: - // CHECK-NEXT: fmov d0, x0 - // CHECK-NEXT: ret return vset_lane_u64(a, b, 0); } +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) { - // CHECK-LABEL: test_vsetq_lane_s64: - // CHECK-NEXT: ins.d v0[1], x0 - // CHECK-NEXT: ret return vsetq_lane_s64(a, b, 1); } +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) { - // CHECK-LABEL: test_vsetq_lane_u64: - // CHECK-NEXT: ins.d v0[1], x0 - // CHECK-NEXT: ret return vsetq_lane_u64(a, b, 1); } Index: test/CodeGen/aarch64-poly128.c =================================================================== --- test/CodeGen/aarch64-poly128.c +++ test/CodeGen/aarch64-poly128.c @@ -1,7 +1,7 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK \ -// RUN: --check-prefix=CHECK-ARM64 +// RUN: -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \ +// RUN: | FileCheck %s // Test new aarch64 intrinsics with poly128 // FIXME: Currently, poly128_t is equal to uint128, which will be split into @@ -12,192 +12,238 @@ #include <arm_neon.h> +// CHECK-LABEL: define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128* +// CHECK: store i128 %val, i128* [[TMP1]] +// CHECK: ret void void test_vstrq_p128(poly128_t * ptr, poly128_t val) { - // CHECK-LABEL: test_vstrq_p128 vstrq_p128(ptr, val); - // CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [x0] } +// CHECK-LABEL: define i128 @test_vldrq_p128(i128* %ptr) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128* +// CHECK: [[TMP2:%.*]] = load i128, i128* [[TMP1]] +// CHECK: ret i128 [[TMP2]] poly128_t test_vldrq_p128(poly128_t * ptr) { - // CHECK-LABEL: test_vldrq_p128 return vldrq_p128(ptr); - // CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [x0] } +// CHECK-LABEL: 
define void @test_ld_st_p128(i128* %ptr) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128* +// CHECK: [[TMP2:%.*]] = load i128, i128* [[TMP1]] +// CHECK: [[ADD_PTR:%.*]] = getelementptr inbounds i128, i128* %ptr, i64 1 +// CHECK: [[TMP3:%.*]] = bitcast i128* [[ADD_PTR]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i128* +// CHECK: store i128 [[TMP2]], i128* [[TMP4]] +// CHECK: ret void void test_ld_st_p128(poly128_t * ptr) { - // CHECK-LABEL: test_ld_st_p128 vstrq_p128(ptr+1, vldrq_p128(ptr)); - // CHECK-ARM64: ldp [[PLO:x[0-9]+]], [[PHI:x[0-9]+]], [{{x[0-9]+}}] - // CHECK-ARM64-NEXT: stp [[PLO]], [[PHI]], [{{x[0-9]+}}, #16] } +// CHECK-LABEL: define i128 @test_vmull_p64(i64 %a, i64 %b) #0 { +// CHECK: [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b) #2 +// CHECK: [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128 +// CHECK: ret i128 [[VMULL_P641_I]] poly128_t test_vmull_p64(poly64_t a, poly64_t b) { - // CHECK-LABEL: test_vmull_p64 return vmull_p64(a, b); - // CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d } +// CHECK-LABEL: define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to i64 +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <2 x i64> %b, <2 x i64> %b, <1 x i32> <i32 1> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I7_I]] to i64 +// CHECK: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) #2 +// CHECK: [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128 +// CHECK: ret i128 [[VMULL_P641_I_I]] poly128_t test_vmull_high_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vmull_high_p64 return vmull_high_p64(a, b); - // CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } -// CHECK-LABEL: test_vreinterpretq_p128_s8 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_s8(int8x16_t a) { return vreinterpretq_p128_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p128_s16 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_s16(int16x8_t a) { return vreinterpretq_p128_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p128_s32 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_s32(int32x4_t a) { return vreinterpretq_p128_s32(a); } -// CHECK-LABEL: test_vreinterpretq_p128_s64 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_s64(int64x2_t a) { return vreinterpretq_p128_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p128_u8 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) { return vreinterpretq_p128_u8(a); } -// CHECK-LABEL: 
test_vreinterpretq_p128_u16 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) { return vreinterpretq_p128_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p128_u32 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) { return vreinterpretq_p128_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p128_u64 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) { return vreinterpretq_p128_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p128_f32 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_f32(float32x4_t a) { return vreinterpretq_p128_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p128_f64 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_f64(float64x2_t a) { return vreinterpretq_p128_f64(a); } -// CHECK-LABEL: test_vreinterpretq_p128_p8 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) { return vreinterpretq_p128_p8(a); } -// CHECK-LABEL: test_vreinterpretq_p128_p16 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) { return vreinterpretq_p128_p16(a); } -// CHECK-LABEL: test_vreinterpretq_p128_p64 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) { return vreinterpretq_p128_p64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p128 -// CHECK: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p128(poly128_t a) { return vreinterpretq_s8_p128(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p128 -// CHECK: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_p128(poly128_t a) { return vreinterpretq_s16_p128(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p128 -// CHECK: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p128(poly128_t a) { return vreinterpretq_s32_p128(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p128 -// CHECK: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p128(i128 %a) #0 
{ +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p128(poly128_t a) { return vreinterpretq_s64_p128(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p128 -// CHECK: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_p128(poly128_t a) { return vreinterpretq_u8_p128(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p128 -// CHECK: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_p128(poly128_t a) { return vreinterpretq_u16_p128(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p128 -// CHECK: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p128(poly128_t a) { return vreinterpretq_u32_p128(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p128 -// CHECK: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p128(poly128_t a) { return vreinterpretq_u64_p128(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p128 -// CHECK: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p128(poly128_t a) { return vreinterpretq_f32_p128(a); } -// CHECK-LABEL: test_vreinterpretq_f64_p128 -// CHECK: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p128(poly128_t a) { return vreinterpretq_f64_p128(a); } -// CHECK-LABEL: test_vreinterpretq_p8_p128 -// CHECK: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_p128(poly128_t a) { return vreinterpretq_p8_p128(a); } -// CHECK-LABEL: test_vreinterpretq_p16_p128 -// CHECK: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_p128(poly128_t a) { return vreinterpretq_p16_p128(a); } -// CHECK-LABEL: test_vreinterpretq_p64_p128 -// CHECK: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_p128(poly128_t a) { return vreinterpretq_p64_p128(a); } Index: test/CodeGen/aarch64-poly64.c =================================================================== --- test/CodeGen/aarch64-poly64.c +++ test/CodeGen/aarch64-poly64.c @@ -1,299 +1,634 @@ -// FIXME: This is a front-end test that depends on LLVM optimizations (-O3). -// It should be split into separate files for front/middle/back-end testing. 
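+// NOTE: with the new RUN line below, this file checks the unoptimized
+// front-end IR (cleaned up only by mem2reg) instead of -O3 assembly, so
+// the cmeq/cmtst/bsl assembly checks are replaced by icmp/and/xor IR
+// patterns. A usage sketch for the first test (operand values arbitrary,
+// not part of the test itself):
+//   poly64x1_t x = vcreate_p64(1ULL), y = vcreate_p64(1ULL);
+//   uint64x1_t m = vceq_p64(x, y); /* lane is all-ones when equal */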
- -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK \ -// RUN: --check-prefix=CHECK-ARM64 +// RUN: -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \ +// RUN: | FileCheck %s // Test new aarch64 intrinsics with poly64 #include <arm_neon.h> +// CHECK-LABEL: define <1 x i64> @test_vceq_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vceq_p64(poly64x1_t a, poly64x1_t b) { - // CHECK-LABEL: test_vceq_p64 return vceq_p64(a, b); - // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i64> @test_vceqq_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vceqq_p64 return vceqq_p64(a, b); - // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 x i64> @test_vtst_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK: ret <1 x i64> [[VTST_I]] uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) { - // CHECK-LABEL: test_vtst_p64 return vtst_p64(a, b); - // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i64> @test_vtstq_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK: ret <2 x i64> [[VTST_I]] uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vtstq_p64 return vtstq_p64(a, b); - // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 x i64> @test_vbsl_p64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <1 x i64> [[VBSL5_I]] poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) { - // CHECK-LABEL: test_vbsl_p64 return vbsl_p64(a, b, c); -
// CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vbslq_p64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i64> [[VBSL5_I]] poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) { - // CHECK-LABEL: test_vbslq_p64 return vbslq_p64(a, b, c); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i64 @test_vget_lane_p64(<1 x i64> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] poly64_t test_vget_lane_p64(poly64x1_t v) { - // CHECK-LABEL: test_vget_lane_p64 return vget_lane_p64(v, 0); - // CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i64 @test_vgetq_lane_p64(<2 x i64> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] poly64_t test_vgetq_lane_p64(poly64x2_t v) { - // CHECK-LABEL: test_vgetq_lane_p64 return vgetq_lane_p64(v, 1); - // CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <1 x i64> @test_vset_lane_p64(i64 %a, <1 x i64> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v) { - // CHECK-LABEL: test_vset_lane_p64 return vset_lane_p64(a, v, 0); - // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} } +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_p64(i64 %a, <2 x i64> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v) { - // CHECK-LABEL: test_vsetq_lane_p64 return vsetq_lane_p64(a, v, 1); - // CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcopy_lane_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) { - //
CHECK-LABEL: test_vcopy_lane_p64 return vcopy_lane_p64(a, 0, b, 0); - // CHECK-ARM64: mov v0.16b, v1.16b } +// CHECK-LABEL: define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b) { - // CHECK-LABEL: test_vcopyq_lane_p64 return vcopyq_lane_p64(a, 1, b, 0); - // CHECK: zip1 v0.2d, v0.2d, v1.2d } +// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vcopyq_laneq_p64 return vcopyq_laneq_p64(a, 1, b, 1); } +// CHECK-LABEL: define <1 x i64> @test_vcreate_p64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vcreate_p64(uint64_t a) { - // CHECK-LABEL: test_vcreate_p64 return vcreate_p64(a); - // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vdup_n_p64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VECINIT_I]] poly64x1_t test_vdup_n_p64(poly64_t a) { - // CHECK-LABEL: test_vdup_n_p64 return vdup_n_p64(a); - // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} } +// CHECK-LABEL: define <2 x i64> @test_vdupq_n_p64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VECINIT1_I]] poly64x2_t test_vdupq_n_p64(poly64_t a) { - // CHECK-LABEL: test_vdupq_n_p64 return vdupq_n_p64(a); - // CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vmov_n_p64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VECINIT_I]] poly64x1_t test_vmov_n_p64(poly64_t a) { - // CHECK-LABEL: test_vmov_n_p64 return vmov_n_p64(a); - // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} } +// CHECK-LABEL: define <2 x i64> @test_vmovq_n_p64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VECINIT1_I]] poly64x2_t test_vmovq_n_p64(poly64_t a) { - // CHECK-LABEL: test_vmovq_n_p64 return vmovq_n_p64(a); - // CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE]] poly64x1_t test_vdup_lane_p64(poly64x1_t vec) { - // CHECK-LABEL: 
test_vdup_lane_p64 return vdup_lane_p64(vec, 0); - // CHECK: ret } +// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[SHUFFLE]] poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) { - // CHECK-LABEL: test_vdupq_lane_p64 return vdupq_lane_p64(vec, 0); - // CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> <i32 1, i32 1> +// CHECK: ret <2 x i64> [[SHUFFLE]] poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) { - // CHECK-LABEL: test_vdupq_laneq_p64 return vdupq_laneq_p64(vec, 1); - // CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) { - // CHECK-LABEL: test_vcombine_p64 return vcombine_p64(low, high); - // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <1 x i64> @test_vld1_p64(i64* %ptr) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]] +// CHECK: ret <1 x i64> [[TMP2]] poly64x1_t test_vld1_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld1_p64 return vld1_p64(ptr); - // CHECK-ARM64: ldr {{d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i64> @test_vld1q_p64(i64* %ptr) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]] +// CHECK: ret <2 x i64> [[TMP2]] poly64x2_t test_vld1q_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld1q_p64 return vld1q_p64(ptr); - // CHECK-ARM64: ldr {{q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p64(i64* %ptr, <1 x i64> %val) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %val to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1_p64(poly64_t * ptr, poly64x1_t val) { - // CHECK-LABEL: test_vst1_p64 return vst1_p64(ptr, val); - // CHECK-ARM64: str {{d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p64(i64* %ptr, <2 x i64> %val) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %val to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) { - // CHECK-LABEL: test_vst1q_p64 return vst1q_p64(ptr, val); - // CHECK-ARM64: str {{q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: 
[[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x2_t [[TMP6]] poly64x1x2_t test_vld2_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld2_p64 return vld2_p64(ptr); - // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x2_t [[TMP6]] poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld2q_p64 return vld2q_p64(ptr); - // CHECK: ld2 {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x3_t [[TMP6]] poly64x1x3_t test_vld3_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld3_p64 return vld3_p64(ptr); - // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 
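+// NOTE: the vldN checks here all share one shape, shown in full above for
+// vld2/vld3 and continued below: the intrinsic returns an IR aggregate
+// such as { <2 x i64>, <2 x i64>, <2 x i64> }, which is stored to a local
+// and then memcpy'd into the returned struct. A minimal usage sketch (the
+// buffer `buf` and its contents are hypothetical):
+//   poly64_t buf[6] = {0};
+//   poly64x2x3_t v = vld3q_p64(buf); /* v.val[0..2] hold the deinterleaved vectors */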
+// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x3_t [[TMP6]] poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld3q_p64 return vld3q_p64(ptr); - // CHECK: ld3 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x4_t [[TMP6]] poly64x1x4_t test_vld4_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld4_p64 return vld4_p64(ptr); - // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, 
%struct.poly64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x4_t [[TMP6]] poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld4q_p64 return vld4q_p64(ptr); - // CHECK: ld4 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_p64(i64* %ptr, [2 x <1 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[VAL]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[VAL]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]3, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) { - // CHECK-LABEL: test_vst2_p64 return vst2_p64(ptr, val); - // CHECK: st1 {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_p64(i64* %ptr, [2 x <2 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[VAL]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[VAL]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 
x i64>* [[ARRAYIDX]]3, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) { - // CHECK-LABEL: test_vst2q_p64 return vst2q_p64(ptr, val); - // CHECK: st2 {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3_p64(i64* %ptr, [3 x <1 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[VAL]], i32 0, i32 0 +// CHECK: store [3 x <1 x i64>] [[VAL]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]3, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]4, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]5, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP1]]0, <1 x i64> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) { - // CHECK-LABEL: test_vst3_p64 return vst3_p64(ptr, val); - // CHECK: st1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3q_p64(i64* %ptr, [3 x <2 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[VAL]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[VAL]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* 
[[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]3, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]4, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]5, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP1]]0, <2 x i64> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) { - // CHECK-LABEL: test_vst3q_p64 return vst3q_p64(ptr, val); - // CHECK: st3 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_p64(i64* %ptr, [4 x <1 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[VAL]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[VAL]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]3, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]4, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]5, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> 
[[TMP7]] to <8 x i8> +// CHECK: [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]6, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]7, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) { - // CHECK-LABEL: test_vst4_p64 return vst4_p64(ptr, val); - // CHECK: st1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_p64(i64* %ptr, [4 x <2 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[VAL]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] [[VAL]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]3, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]4, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]5, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]6, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]7, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x i64> +// CHECK: call void 
@llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, <2 x i64> [[TMP1]]3, <2 x i64> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_p64(poly64_t * ptr, poly64x2x4_t val) { - // CHECK-LABEL: test_vst4q_p64 return vst4q_p64(ptr, val); - // CHECK: st4 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <1 x i64> @test_vext_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[VEXT]] poly64x1_t test_vext_p64(poly64x1_t a, poly64x1_t b) { - // CHECK-LABEL: test_vext_p64 return vext_u64(a, b, 0); } +// CHECK-LABEL: define <2 x i64> @test_vextq_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2> +// CHECK: ret <2 x i64> [[VEXT]] poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vextq_p64 return vextq_p64(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{#0x8|#8}} } +// CHECK-LABEL: define <2 x i64> @test_vzip1q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vzip1q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vzip1q_p64 return vzip1q_p64(a, b); - // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vzip2q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vzip2q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vzip2q_p64 return vzip2q_u64(a, b); - // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vuzp1q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vuzp1q_p64 return vuzp1q_p64(a, b); - // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vuzp2q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vuzp2q_p64 return vuzp2q_u64(a, b); - // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vtrn1q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vtrn1q_p64 return vtrn1q_p64(a, b); - // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vtrn2q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vtrn2q_p64 return vtrn2q_u64(a, b); - // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 x i64> @test_vsri_n_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N]]1, i32 33) +// CHECK: ret <1 x i64> [[VSRI_N]]2 poly64x1_t test_vsri_n_p64(poly64x1_t a, poly64x1_t b) { - // CHECK-LABEL: test_vsri_n_p64 return vsri_n_p64(a, b, 33); - // CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #33 } +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N]]1, i32 64) +// CHECK: ret <2 x i64> [[VSRI_N]]2 poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vsriq_n_p64 return vsriq_n_p64(a, b, 64); - // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #64 } Index: test/CodeGen/arm-bitfield-alignment.c =================================================================== --- test/CodeGen/arm-bitfield-alignment.c +++ test/CodeGen/arm-bitfield-alignment.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - -O3 %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - -O3 %s | FileCheck %s +// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - %s | FileCheck %s extern struct T { int b0 : 8; Index: test/CodeGen/arm-crc32.c =================================================================== --- test/CodeGen/arm-crc32.c +++ test/CodeGen/arm-crc32.c @@ -1,6 +1,5 @@ -// REQUIRES: arm-registered-target // RUN: %clang_cc1 -triple armv8-none-linux-gnueabi \ -// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s int crc32b(int a, char b) { @@ -48,7 +47,7 @@ // CHECK: [[T0:%[0-9]+]] = trunc i64 %b to i32 // CHECK: [[T1:%[0-9]+]] = lshr i64 %b, 32 // CHECK: [[T2:%[0-9]+]] = trunc i64 [[T1]] to i32 -// CHECK: [[T3:%[0-9]+]] = tail call i32 @llvm.arm.crc32w(i32 %a, i32 [[T0]]) +// CHECK: [[T3:%[0-9]+]] = call i32 @llvm.arm.crc32w(i32 %a, i32 [[T0]]) // CHECK: call i32 @llvm.arm.crc32w(i32 [[T3]], i32 [[T2]]) } @@ -58,6 +57,6 @@ // CHECK: [[T0:%[0-9]+]] = trunc i64 %b to i32 // CHECK: [[T1:%[0-9]+]] = lshr i64 %b, 32 // CHECK: [[T2:%[0-9]+]] = trunc i64 [[T1]] to i32 -// CHECK: [[T3:%[0-9]+]] = tail call i32 @llvm.arm.crc32cw(i32 %a, i32 [[T0]]) +// CHECK: [[T3:%[0-9]+]] = call i32 @llvm.arm.crc32cw(i32 %a, i32 [[T0]]) // CHECK: call i32 @llvm.arm.crc32cw(i32 [[T3]], i32 [[T2]]) } Index: 
test/CodeGen/arm-neon-directed-rounding.c =================================================================== --- test/CodeGen/arm-neon-directed-rounding.c +++ test/CodeGen/arm-neon-directed-rounding.c @@ -1,75 +1,135 @@ -// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s #include +// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDA_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> [[VRNDA_V_I]]) #2 +// CHECK: [[VRNDA_V2_I:%.*]] = bitcast <2 x float> [[VRNDA_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDA_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrnda_f32(float32x2_t a) { - // CHECK-LABEL: test_vrnda_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a) return vrnda_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDAQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> [[VRNDAQ_V_I]]) #2 +// CHECK: [[VRNDAQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDAQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDAQ_V2_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndaq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndaq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a) return vrndaq_f32(a); } +// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> [[VRNDM_V_I]]) #2 +// CHECK: [[VRNDM_V2_I:%.*]] = bitcast <2 x float> [[VRNDM_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDM_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrndm_f32(float32x2_t a) { - // CHECK-LABEL: test_vrndm_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a) return vrndm_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> [[VRNDMQ_V_I]]) #2 +// CHECK: [[VRNDMQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDMQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDMQ_V2_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndmq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndmq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a) return vrndmq_f32(a); } +// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> [[VRNDN_V_I]]) #2 +// CHECK: 
[[VRNDN_V2_I:%.*]] = bitcast <2 x float> [[VRNDN_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDN_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrndn_f32(float32x2_t a) { - // CHECK-LABEL: test_vrndn_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a) return vrndn_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDNQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> [[VRNDNQ_V_I]]) #2 +// CHECK: [[VRNDNQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDNQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDNQ_V2_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndnq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndnq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a) return vrndnq_f32(a); } +// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDP_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> [[VRNDP_V_I]]) #2 +// CHECK: [[VRNDP_V2_I:%.*]] = bitcast <2 x float> [[VRNDP_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDP_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrndp_f32(float32x2_t a) { - // CHECK-LABEL: test_vrndp_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a) return vrndp_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDPQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> [[VRNDPQ_V_I]]) #2 +// CHECK: [[VRNDPQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDPQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDPQ_V2_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndpq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndpq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a) return vrndpq_f32(a); } +// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> [[VRNDX_V_I]]) #2 +// CHECK: [[VRNDX_V2_I:%.*]] = bitcast <2 x float> [[VRNDX_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDX_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrndx_f32(float32x2_t a) { - // CHECK-LABEL: test_vrndx_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a) return vrndx_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> [[VRNDXQ_V_I]]) #2 +// CHECK: [[VRNDXQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDXQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDXQ_V2_I]] to <4 x float> +// 
CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndxq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndxq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a) return vrndxq_f32(a); } +// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRND_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> [[VRND_V_I]]) #2 +// CHECK: [[VRND_V2_I:%.*]] = bitcast <2 x float> [[VRND_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRND_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrnd_f32(float32x2_t a) { - // CHECK-LABEL: test_vrnd_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a) return vrnd_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> [[VRNDQ_V_I]]) #2 +// CHECK: [[VRNDQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDQ_V2_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a) return vrndq_f32(a); } Index: test/CodeGen/arm-neon-fma.c =================================================================== --- test/CodeGen/arm-neon-fma.c +++ test/CodeGen/arm-neon-fma.c @@ -1,19 +1,34 @@ -// REQUIRES: arm-registered-target // RUN: %clang_cc1 -triple thumbv7-none-linux-gnueabihf \ // RUN: -target-abi aapcs \ // RUN: -target-cpu cortex-a8 \ // RUN: -mfloat-abi hard \ // RUN: -ffreestanding \ -// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s #include +// CHECK-LABEL: define <2 x float> @test_fma_order(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %accum to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %lhs to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %rhs to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2 +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) { return vfma_f32(accum, lhs, rhs); -// CHECK: call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs, <2 x float> %accum) } +// CHECK-LABEL: define <4 x float> @test_fmaq_order(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %accum to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %lhs to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %rhs to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> 
[[TMP5]], <4 x float> [[TMP3]]) #2 +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) { return vfmaq_f32(accum, lhs, rhs); -// CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %accum) } Index: test/CodeGen/arm-neon-numeric-maxmin.c =================================================================== --- test/CodeGen/arm-neon-numeric-maxmin.c +++ test/CodeGen/arm-neon-numeric-maxmin.c @@ -1,27 +1,55 @@ -// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s #include +// CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMAXNM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAXNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[VMAXNM_V_I]], <2 x float> [[VMAXNM_V1_I]]) #2 +// CHECK: [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAXNM_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vmaxnm_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %a, <2 x float> %b) return vmaxnm_f32(a, b); } +// CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMAXNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMAXNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[VMAXNMQ_V_I]], <4 x float> [[VMAXNMQ_V1_I]]) #2 +// CHECK: [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXNMQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vmaxnmq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> %a, <4 x float> %b) return vmaxnmq_f32(a, b); } +// CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMINNM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMINNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[VMINNM_V_I]], <2 x float> [[VMINNM_V1_I]]) #2 +// CHECK: [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMINNM_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vminnm_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> %a, <2 x float> %b) return vminnm_f32(a, b); } +// CHECK-LABEL: define <4 x float> 
@test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMINNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMINNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[VMINNMQ_V_I]], <4 x float> [[VMINNMQ_V1_I]]) #2 +// CHECK: [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINNMQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vminnmq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> %a, <4 x float> %b) return vminnmq_f32(a, b); } Index: test/CodeGen/arm-neon-shifts.c =================================================================== --- test/CodeGen/arm-neon-shifts.c +++ test/CodeGen/arm-neon-shifts.c @@ -2,7 +2,7 @@ // RUN: %clang_cc1 -triple thumbv7-apple-darwin \ // RUN: -target-cpu cortex-a8 \ // RUN: -ffreestanding \ -// RUN: -emit-llvm -w -O1 -o - %s | FileCheck %s +// RUN: -emit-llvm -w -o - %s | opt -S -mem2reg | FileCheck %s #include @@ -27,19 +27,20 @@ uint8x8_t test_shift_vsra(uint8x8_t a, uint8x8_t b) { // CHECK-LABEL: test_shift_vsra // CHECK: %[[SHR:.*]] = lshr <8 x i8> %b, - // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a + // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]] return vsra_n_u8(a, b, 5); } int8x8_t test_shift_vsra_smax(int8x8_t a, int8x8_t b) { // CHECK-LABEL: test_shift_vsra_smax // CHECK: %[[SHR:.*]] = ashr <8 x i8> %b, - // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a + // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]] return vsra_n_s8(a, b, 8); } uint8x8_t test_shift_vsra_umax(uint8x8_t a, uint8x8_t b) { // CHECK-LABEL: test_shift_vsra_umax - // CHECK: ret <8 x i8> %a + // CHECK: [[RES:%.*]] = add <8 x i8> %a, zeroinitializer + // CHECK: ret <8 x i8> [[RES]] return vsra_n_u8(a, b, 8); } Index: test/CodeGen/arm-neon-vcvtX.c =================================================================== --- test/CodeGen/arm-neon-vcvtX.c +++ test/CodeGen/arm-neon-vcvtX.c @@ -1,99 +1,147 @@ -// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s #include +// CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTA_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[VCVTA_S32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTA_S32_V1_I]] int32x2_t test_vcvta_s32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvta_s32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a) return vcvta_s32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTA_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[VCVTA_U32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTA_U32_V1_I]] uint32x2_t test_vcvta_u32_f32(float32x2_t a) { - // 
CHECK-LABEL: test_vcvta_u32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a) return vcvta_u32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTAQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[VCVTAQ_S32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTAQ_S32_V1_I]] int32x4_t test_vcvtaq_s32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtaq_s32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a) return vcvtaq_s32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTAQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[VCVTAQ_U32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTAQ_U32_V1_I]] uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtaq_u32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a) return vcvtaq_u32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTN_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[VCVTN_S32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTN_S32_V1_I]] int32x2_t test_vcvtn_s32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtn_s32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a) return vcvtn_s32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTN_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_U32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTN_U32_V1_I]] uint32x2_t test_vcvtn_u32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtn_u32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a) return vcvtn_u32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTNQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[VCVTNQ_S32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTNQ_S32_V1_I]] int32x4_t test_vcvtnq_s32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtnq_s32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a) return vcvtnq_s32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTNQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[VCVTNQ_U32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTNQ_U32_V1_I]] uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtnq_u32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x 
float> %a) return vcvtnq_u32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTP_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[VCVTP_S32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTP_S32_V1_I]] int32x2_t test_vcvtp_s32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtp_s32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a) return vcvtp_s32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTP_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_U32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTP_U32_V1_I]] uint32x2_t test_vcvtp_u32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtp_u32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a) return vcvtp_u32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTPQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[VCVTPQ_S32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTPQ_S32_V1_I]] int32x4_t test_vcvtpq_s32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtpq_s32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a) return vcvtpq_s32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTPQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[VCVTPQ_U32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTPQ_U32_V1_I]] uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtpq_u32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a) return vcvtpq_u32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTM_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[VCVTM_S32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTM_S32_V1_I]] int32x2_t test_vcvtm_s32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtm_s32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a) return vcvtm_s32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTM_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_U32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTM_U32_V1_I]] uint32x2_t test_vcvtm_u32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtm_u32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a) return vcvtm_u32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 { +// 
CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTMQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[VCVTMQ_S32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTMQ_S32_V1_I]] int32x4_t test_vcvtmq_s32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtmq_s32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a) return vcvtmq_s32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTMQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[VCVTMQ_U32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTMQ_U32_V1_I]] uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtmq_u32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a) return vcvtmq_u32_f32(a); } Index: test/CodeGen/arm-neon-vget.c =================================================================== --- test/CodeGen/arm-neon-vget.c +++ test/CodeGen/arm-neon-vget.c @@ -1,124 +1,123 @@ -// REQUIRES: arm-registered-target // RUN: %clang_cc1 -triple thumbv7-apple-darwin \ // RUN: -target-abi apcs-gnu \ // RUN: -target-cpu cortex-a8 \ // RUN: -mfloat-abi soft \ // RUN: -target-feature +soft-float-abi \ // RUN: -ffreestanding \ -// RUN: -emit-llvm -w -O1 -o - %s | FileCheck %s +// RUN: -emit-llvm -w -o - %s | opt -S -mem2reg | FileCheck %s #include // Check that the vget_low/vget_high intrinsics generate a single shuffle // without any bitcasting. int8x8_t low_s8(int8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_low_s8(a); } uint8x8_t low_u8 (uint8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_low_u8(a); } int16x4_t low_s16( int16x8_t a) { -// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_low_s16(a); } uint16x4_t low_u16(uint16x8_t a) { -// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_low_u16(a); } int32x2_t low_s32( int32x4_t a) { -// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> +// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> return vget_low_s32(a); } uint32x2_t low_u32(uint32x4_t a) { -// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> +// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> return vget_low_u32(a); } int64x1_t low_s64( int64x2_t a) { -// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer +// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer return vget_low_s64(a); } uint64x1_t low_u64(uint64x2_t a) { -// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer +// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer return vget_low_u64(a); } poly8x8_t low_p8 (poly8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_low_p8(a); } poly16x4_t low_p16(poly16x8_t a) { -// CHECK: 
shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_low_p16(a); } float32x2_t low_f32(float32x4_t a) { -// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> +// CHECK: shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> return vget_low_f32(a); } int8x8_t high_s8(int8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_high_s8(a); } uint8x8_t high_u8 (uint8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_high_u8(a); } int16x4_t high_s16( int16x8_t a) { -// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_high_s16(a); } uint16x4_t high_u16(uint16x8_t a) { -// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_high_u16(a); } int32x2_t high_s32( int32x4_t a) { -// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> +// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> return vget_high_s32(a); } uint32x2_t high_u32(uint32x4_t a) { -// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> +// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> return vget_high_u32(a); } int64x1_t high_s64( int64x2_t a) { -// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> +// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> return vget_high_s64(a); } uint64x1_t high_u64(uint64x2_t a) { -// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> +// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> return vget_high_u64(a); } poly8x8_t high_p8 (poly8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_high_p8(a); } poly16x4_t high_p16(poly16x8_t a) { -// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_high_p16(a); } float32x2_t high_f32(float32x4_t a) { -// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> +// CHECK: shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> return vget_high_f32(a); } Index: test/CodeGen/arm64-be-bitfield.c =================================================================== --- test/CodeGen/arm64-be-bitfield.c +++ test/CodeGen/arm64-be-bitfield.c @@ -1,6 +1,4 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -emit-llvm -O0 -o - %s | FileCheck --check-prefix IR %s -// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -S -O1 -o - %s | FileCheck --check-prefix ARM %s struct bt3 { signed b2:10; signed b3:10; } b16; @@ -10,6 +8,5 @@ // IR: store i64 [[ARG]], i64* [[PTR:%.*]], align 8 // IR: [[BITCAST:%.*]] = bitcast i64* [[PTR]] to i8* // IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* [[BITCAST]], i64 4 -// ARM: asr x0, x0, #54 return bp11.b2; } Index: test/CodeGen/arm64-crc32.c =================================================================== --- test/CodeGen/arm64-crc32.c +++ test/CodeGen/arm64-crc32.c @@ -1,6 +1,6 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu \ -// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s +// RUN: -S 
-emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s int crc32b(int a, char b) { Index: test/CodeGen/arm64-lanes.c =================================================================== --- test/CodeGen/arm64-lanes.c +++ test/CodeGen/arm64-lanes.c @@ -1,74 +1,127 @@ -// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -O3 -triple aarch64_be-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-BE +// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s --check-prefix CHECK-BE #include -// CHECK-LABEL: @test_vdupb_lane_s8 int8_t test_vdupb_lane_s8(int8x8_t src) { return vdupb_lane_s8(src, 2); + // CHECK-LABEL: @test_vdupb_lane_s8 // CHECK: extractelement <8 x i8> %src, i32 2 - // CHECK-BE: extractelement <8 x i8> %src, i32 5 + + // CHECK-BE-LABEL: @test_vdupb_lane_s8 + // CHECK-BE: [[REV:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i32> + // CHECK-BE: extractelement <8 x i8> [[REV]], i32 2 } -// CHECK-LABEL: @test_vdupb_lane_u8 uint8_t test_vdupb_lane_u8(uint8x8_t src) { return vdupb_lane_u8(src, 2); + // CHECK-LABEL: @test_vdupb_lane_u8 // CHECK: extractelement <8 x i8> %src, i32 2 - // CHECK-BE: extractelement <8 x i8> %src, i32 5 + + // CHECK-BE-LABEL: @test_vdupb_lane_u8 + // CHECK-BE: [[REV:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i32> + // CHECK-BE: extractelement <8 x i8> [[REV]], i32 2 } -// CHECK-LABEL: @test_vduph_lane_s16 int16_t test_vduph_lane_s16(int16x4_t src) { return vduph_lane_s16(src, 2); - // CHECK: extractelement <4 x i16> %src, i32 2 - // CHECK-BE: extractelement <4 x i16> %src, i32 1 + // CHECK-LABEL: @test_vduph_lane_s16 + // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16> + // CHECK: extractelement <4 x i16> [[TMP2]], i32 2 + + // CHECK-BE-LABEL: @test_vduph_lane_s16 + // CHECK-BE: [[REV:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i32> + // CHECK-BE: [[TMP1:%.*]] = bitcast <4 x i16> [[REV]] to [[TYPE:.*]] + // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16> + // CHECK-BE: extractelement <4 x i16> [[TMP2]], i32 2 } -// CHECK-LABEL: @test_vduph_lane_u16 uint16_t test_vduph_lane_u16(uint16x4_t src) { return vduph_lane_u16(src, 2); - // CHECK: extractelement <4 x i16> %src, i32 2 - // CHECK-BE: extractelement <4 x i16> %src, i32 1 + // CHECK-LABEL: @test_vduph_lane_u16 + // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16> + // CHECK: extractelement <4 x i16> [[TMP2]], i32 2 + + // CHECK-BE-LABEL: @test_vduph_lane_u16 + // CHECK-BE: [[REV:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i32> + // CHECK-BE: [[TMP1:%.*]] = bitcast <4 x i16> [[REV]] to [[TYPE:.*]] + // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16> + // CHECK-BE: extractelement <4 x i16> [[TMP2]], i32 2 } -// CHECK-LABEL: @test_vdups_lane_s32 int32_t test_vdups_lane_s32(int32x2_t src) { return vdups_lane_s32(src, 0); - // CHECK: extractelement <2 x i32> %src, i32 0 - // CHECK-BE: extractelement <2 x i32> %src, i32 1 + // CHECK-LABEL: @test_vdups_lane_s32 + // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32> + 
// CHECK: extractelement <2 x i32> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdups_lane_s32 + // CHECK-BE: [[REV:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> + // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x i32> [[REV]] to [[TYPE:.*]] + // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32> + // CHECK-BE: extractelement <2 x i32> [[TMP2]], i32 0 } -// CHECK-LABEL: @test_vdups_lane_u32 uint32_t test_vdups_lane_u32(uint32x2_t src) { return vdups_lane_u32(src, 0); - // CHECK: extractelement <2 x i32> %src, i32 0 - // CHECK-BE: extractelement <2 x i32> %src, i32 1 + // CHECK-LABEL: @test_vdups_lane_u32 + // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32> + // CHECK: extractelement <2 x i32> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdups_lane_u32 + // CHECK-BE: [[REV:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> + // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x i32> [[REV]] to [[TYPE:.*]] + // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32> + // CHECK-BE: extractelement <2 x i32> [[TMP2]], i32 0 } -// CHECK-LABEL: @test_vdups_lane_f32 float32_t test_vdups_lane_f32(float32x2_t src) { return vdups_lane_f32(src, 0); - // CHECK: extractelement <2 x float> %src, i32 0 - // CHECK-BE: extractelement <2 x float> %src, i32 1 + // CHECK-LABEL: @test_vdups_lane_f32 + // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x float> + // CHECK: extractelement <2 x float> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdups_lane_f32 + // CHECK-BE: [[REV:%.*]] = shufflevector <2 x float> {{.*}}, <2 x i32> + // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x float> [[REV]] to [[TYPE:.*]] + // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x float> + // CHECK-BE: extractelement <2 x float> [[TMP2]], i32 0 } -// CHECK-LABEL: @test_vdupd_lane_s64 int64_t test_vdupd_lane_s64(int64x1_t src) { return vdupd_lane_s64(src, 0); - // CHECK: extractelement <1 x i64> %src, i32 0 - // CHECK-BE: extractelement <1 x i64> %src, i32 0 + // CHECK-LABEL: @test_vdupd_lane_s64 + // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x i64> + // CHECK: extractelement <1 x i64> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdupd_lane_s64 + // CHECK-BE: extractelement <1 x i64> {{.*}}, i32 0 } -// CHECK-LABEL: @test_vdupd_lane_u64 uint64_t test_vdupd_lane_u64(uint64x1_t src) { return vdupd_lane_u64(src, 0); - // CHECK: extractelement <1 x i64> %src, i32 0 - // CHECK-BE: extractelement <1 x i64> %src, i32 0 + // CHECK-LABEL: @test_vdupd_lane_u64 + // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x i64> + // CHECK: extractelement <1 x i64> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdupd_lane_u64 + // CHECK-BE: extractelement <1 x i64> {{.*}}, i32 0 } -// CHECK-LABEL: @test_vdupd_lane_f64 float64_t test_vdupd_lane_f64(float64x1_t src) { return vdupd_lane_f64(src, 0); - // CHECK: extractelement <1 x double> %src, i32 0 - // CHECK-BE: extractelement <1 x double> %src, i32 0 + // CHECK-LABEL: @test_vdupd_lane_f64 + // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x double> + // CHECK: extractelement <1 x double> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdupd_lane_f64 + // CHECK-BE: extractelement <1 x double> {{.*}}, i32 0 } Index: 
test/CodeGen/arm64_vcopy.c =================================================================== --- test/CodeGen/arm64_vcopy.c +++ test/CodeGen/arm64_vcopy.c @@ -1,69 +1,121 @@ -// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s // Test ARM64 SIMD copy vector element to vector element: vcopyq_lane* #include +// CHECK-LABEL: define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %a1, <16 x i8> %a2) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a2, i32 13 +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %a1, i8 [[VGETQ_LANE]], i32 3 +// CHECK: ret <16 x i8> [[VSET_LANE]] int8x16_t test_vcopyq_laneq_s8(int8x16_t a1, int8x16_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_s8 return vcopyq_laneq_s8(a1, (int64_t) 3, a2, (int64_t) 13); - // CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> } +// CHECK-LABEL: define <16 x i8> @test_vcopyq_laneq_u8(<16 x i8> %a1, <16 x i8> %a2) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a2, i32 13 +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %a1, i8 [[VGETQ_LANE]], i32 3 +// CHECK: ret <16 x i8> [[VSET_LANE]] uint8x16_t test_vcopyq_laneq_u8(uint8x16_t a1, uint8x16_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_u8 return vcopyq_laneq_u8(a1, (int64_t) 3, a2, (int64_t) 13); - // CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> } +// CHECK-LABEL: define <8 x i16> @test_vcopyq_laneq_s16(<8 x i16> %a1, <8 x i16> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[VGETQ_LANE]], i32 3 +// CHECK: ret <8 x i16> [[VSET_LANE]] int16x8_t test_vcopyq_laneq_s16(int16x8_t a1, int16x8_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_s16 return vcopyq_laneq_s16(a1, (int64_t) 3, a2, (int64_t) 7); - // CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> } +// CHECK-LABEL: define <8 x i16> @test_vcopyq_laneq_u16(<8 x i16> %a1, <8 x i16> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[VGETQ_LANE]], i32 3 +// CHECK: ret <8 x i16> [[VSET_LANE]] uint16x8_t test_vcopyq_laneq_u16(uint16x8_t a1, uint16x8_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_u16 return vcopyq_laneq_u16(a1, (int64_t) 3, a2, (int64_t) 7); - // CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> } +// CHECK-LABEL: define <4 x i32> @test_vcopyq_laneq_s32(<4 x i32> %a1, <4 x i32> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = 
insertelement <4 x i32> [[TMP3]], i32 [[VGETQ_LANE]], i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] int32x4_t test_vcopyq_laneq_s32(int32x4_t a1, int32x4_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_s32 return vcopyq_laneq_s32(a1, (int64_t) 3, a2, (int64_t) 3); - // CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> } +// CHECK-LABEL: define <4 x i32> @test_vcopyq_laneq_u32(<4 x i32> %a1, <4 x i32> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[VGETQ_LANE]], i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] uint32x4_t test_vcopyq_laneq_u32(uint32x4_t a1, uint32x4_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_u32 return vcopyq_laneq_u32(a1, (int64_t) 3, a2, (int64_t) 3); - // CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> } +// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_s64(<2 x i64> %a1, <2 x i64> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 0 +// CHECK: ret <2 x i64> [[VSET_LANE]] int64x2_t test_vcopyq_laneq_s64(int64x2_t a1, int64x2_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_s64 return vcopyq_laneq_s64(a1, (int64_t) 0, a2, (int64_t) 1); - // CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> } +// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_u64(<2 x i64> %a1, <2 x i64> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 0 +// CHECK: ret <2 x i64> [[VSET_LANE]] uint64x2_t test_vcopyq_laneq_u64(uint64x2_t a1, uint64x2_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_u64 return vcopyq_laneq_u64(a1, (int64_t) 0, a2, (int64_t) 1); - // CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> } +// CHECK-LABEL: define <4 x float> @test_vcopyq_laneq_f32(<4 x float> %a1, <4 x float> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP3]], float [[VGETQ_LANE]], i32 0 +// CHECK: ret <4 x float> [[VSET_LANE]] float32x4_t test_vcopyq_laneq_f32(float32x4_t a1, float32x4_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_f32 return vcopyq_laneq_f32(a1, 0, a2, 3); - // CHECK: shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> } +// CHECK-LABEL: define <2 x double> @test_vcopyq_laneq_f64(<2 x double> %a1, <2 x double> %a2) 
#0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x double> [[TMP3]], double [[VGETQ_LANE]], i32 0 +// CHECK: ret <2 x double> [[VSET_LANE]] float64x2_t test_vcopyq_laneq_f64(float64x2_t a1, float64x2_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_f64 return vcopyq_laneq_f64(a1, 0, a2, 1); - // CHECK: shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> } Index: test/CodeGen/arm64_vcreate.c =================================================================== --- test/CodeGen/arm64_vcreate.c +++ test/CodeGen/arm64_vcreate.c @@ -1,7 +1,6 @@ -// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s // Test ARM64 SIMD vcreate intrinsics -/*#include */ #include float32x2_t test_vcreate_f32(uint64_t a1) { @@ -10,14 +9,3 @@ // CHECK: bitcast {{.*}} to <2 x float> // CHECK-NEXT: ret } - -// FIXME enable when scalar_to_vector in backend is fixed. Also, change -// CHECK@ to CHECK and CHECK-NEXT@ to CHECK-NEXT -/* -float64x1_t test_vcreate_f64(uint64_t a1) { - // CHECK@ test_vcreate_f64 - return vcreate_f64(a1); - // CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32 - // CHECK-NEXT@ ret -} -*/ Index: test/CodeGen/arm64_vdupq_n_f64.c =================================================================== --- test/CodeGen/arm64_vdupq_n_f64.c +++ test/CodeGen/arm64_vdupq_n_f64.c @@ -1,88 +1,78 @@ -// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck %s -// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | \ -// RUN: FileCheck -check-prefix=CHECK-IR %s -// REQUIRES: aarch64-registered-target - -/// Test vdupq_n_f64 and vmovq_nf64 ARM64 intrinsics -// ARM64: vdupq_n_f64 and vdupq_lane_f64 intrinsics -// missing - +// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -fallow-half-arguments-and-returns -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s #include // vdupq_n_f64 -> dup.2d v0, v0[0] // -float64x2_t test_vdupq_n_f64(float64_t w) -{ +// CHECK-LABEL: define <2 x double> @test_vdupq_n_f64(double %w) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %w, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1 +// CHECK: ret <2 x double> [[VECINIT1_I]] +float64x2_t test_vdupq_n_f64(float64_t w) { return vdupq_n_f64(w); - // CHECK-LABEL: test_vdupq_n_f64: - // CHECK: dup.2d v0, v0[0] - // CHECK-NEXT: ret } // might as well test this while we're here // vdupq_n_f32 -> dup.4s v0, v0[0] -float32x4_t test_vdupq_n_f32(float32_t w) -{ +// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %w) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %w, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %w, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %w, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %w, i32 3 +// CHECK: ret <4 x float> 
[[VECINIT3_I]] +float32x4_t test_vdupq_n_f32(float32_t w) { return vdupq_n_f32(w); - // CHECK-LABEL: test_vdupq_n_f32: - // CHECK: dup.4s v0, v0[0] - // CHECK-NEXT: ret } // vdupq_lane_f64 -> dup.2d v0, v0[0] // this was in , but had already been implemented, // test anyway -float64x2_t test_vdupq_lane_f64(float64x1_t V) -{ +// CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(<1 x double> %V) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %V, <1 x double> %V, <2 x i32> zeroinitializer +// CHECK: ret <2 x double> [[SHUFFLE]] +float64x2_t test_vdupq_lane_f64(float64x1_t V) { return vdupq_lane_f64(V, 0); - // CHECK-LABEL: test_vdupq_lane_f64: - // CHECK: dup.2d v0, v0[0] - // CHECK-NEXT: ret } // vmovq_n_f64 -> dup Vd.2d,X0 // this wasn't in , but it was between the vdups -float64x2_t test_vmovq_n_f64(float64_t w) -{ +// CHECK-LABEL: define <2 x double> @test_vmovq_n_f64(double %w) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %w, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1 +// CHECK: ret <2 x double> [[VECINIT1_I]] +float64x2_t test_vmovq_n_f64(float64_t w) { return vmovq_n_f64(w); - // CHECK-LABEL: test_vmovq_n_f64: - // CHECK: dup.2d v0, v0[0] - // CHECK-NEXT: ret } -float16x4_t test_vmov_n_f16(float16_t *a1) -{ - // CHECK-IR-LABEL: test_vmov_n_f16 +// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a1) #0 { +// CHECK: [[TMP0:%.*]] = load half, half* %a1, align 2 +// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0 +// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3 +// CHECK: ret <4 x half> [[VECINIT3]] +float16x4_t test_vmov_n_f16(float16_t *a1) { return vmov_n_f16(*a1); - // CHECK-IR: insertelement {{.*}} i32 0{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 1{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 2{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 3{{ *$}} } -// Disable until scalar problem in backend is fixed.
Change CHECK-IR@ to -// CHECK-IR /* -float64x1_t test_vmov_n_f64(float64_t a1) -{ - // CHECK-IR@ test_vmov_n_f64 +float64x1_t test_vmov_n_f64(float64_t a1) { return vmov_n_f64(a1); - // CHECK-IR@ insertelement {{.*}} i32 0{{ *$}} } */ -float16x8_t test_vmovq_n_f16(float16_t *a1) -{ - // CHECK-IR-LABEL: test_vmovq_n_f16 +// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a1) #0 { +// CHECK: [[TMP0:%.*]] = load half, half* %a1, align 2 +// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0 +// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3 +// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4 +// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5 +// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6 +// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7 +// CHECK: ret <8 x half> [[VECINIT7]] +float16x8_t test_vmovq_n_f16(float16_t *a1) { return vmovq_n_f16(*a1); - // CHECK-IR: insertelement {{.*}} i32 0{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 1{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 2{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 3{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 4{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 5{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 6{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 7{{ *$}} } Index: test/CodeGen/arm_neon_intrinsics.c =================================================================== --- test/CodeGen/arm_neon_intrinsics.c +++ test/CodeGen/arm_neon_intrinsics.c @@ -1,1611 +1,2446 @@ // RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\ -// RUN: -target-cpu swift -ffreestanding -Os -S -o - %s\ -// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT -// RUN: %clang_cc1 -triple armv8-linux-gnu \ -// RUN: -target-cpu cortex-a57 -mfloat-abi soft -ffreestanding -Os -S -o - %s\ -// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-A57 +// RUN: -target-cpu swift -ffreestanding -emit-llvm -S -o - %s\ +// RUN: | opt -S -mem2reg | FileCheck %s // REQUIRES: long_tests #include -// CHECK-LABEL: test_vaba_s8 -// CHECK: vaba.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_I_I]] +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vaba_s8(a, b, c); } -// CHECK-LABEL: test_vaba_s16 -// CHECK: vaba.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD2_I_I]] +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t
test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vaba_s16(a, b, c); } -// CHECK-LABEL: test_vaba_s32 -// CHECK: vaba.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD2_I_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vaba_s32(a, b, c); } -// CHECK-LABEL: test_vaba_u8 -// CHECK: vaba.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_I_I]] +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vaba_u8(a, b, c); } -// CHECK-LABEL: test_vaba_u16 -// CHECK: vaba.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD2_I_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vaba_u16(a, b, c); } -// CHECK-LABEL: test_vaba_u32 -// CHECK: vaba.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD2_I_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vaba_u32(a, b, c); } -// CHECK-LABEL: test_vabaq_s8 -// CHECK: vaba.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABD_I_I]] +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vabaq_s8(a, b, c); } -// CHECK-LABEL: test_vabaq_s16 -// CHECK: vaba.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +//
CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABD2_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vabaq_s16(a, b, c); } -// CHECK-LABEL: test_vabaq_s32 -// CHECK: vaba.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABD2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vabaq_s32(a, b, c); } -// CHECK-LABEL: test_vabaq_u8 -// CHECK: vaba.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %b, <16 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABD_I_I]] +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vabaq_u8(a, b, c); } -// CHECK-LABEL: test_vabaq_u16 -// CHECK: vaba.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABD2_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vabaq_u16(a, b, c); } -// CHECK-LABEL: test_vabaq_u32 -// CHECK: vaba.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABD2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vabaq_u32(a, b, c); } -// CHECK-LABEL: test_vabal_s8 -// CHECK: vabal.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vabal_s8(a, b, c); } -// CHECK-LABEL: test_vabal_s16 -// CHECK: vabal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vabal_s16(a, b, c); } -// CHECK-LABEL: test_vabal_s32 -// CHECK: vabal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vabal_s32(a, b, c); } -// CHECK-LABEL: test_vabal_u8 -// CHECK: vabal.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vabal_u8(a, b, c); } -// CHECK-LABEL: test_vabal_u16 -// CHECK: vabal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 
+// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vabal_u16(a, b, c); } -// CHECK-LABEL: test_vabal_u32 -// CHECK: vabal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vabal_u32(a, b, c); } -// CHECK-LABEL: test_vabd_s8 -// CHECK: vabd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VABD_I]] int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) { return vabd_s8(a, b); } -// CHECK-LABEL: test_vabd_s16 -// CHECK: vabd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4 +// CHECK: ret <4 x i16> [[VABD2_I]] int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) { return vabd_s16(a, b); } -// CHECK-LABEL: test_vabd_s32 -// CHECK: vabd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4 +// CHECK: ret <2 x i32> [[VABD2_I]] int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) { return vabd_s32(a, b); } -// CHECK-LABEL: test_vabd_u8 -// CHECK: vabd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VABD_I]] uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) { return vabd_u8(a, b); } -// CHECK-LABEL: test_vabd_u16 -// CHECK: vabd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// 
CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4 +// CHECK: ret <4 x i16> [[VABD2_I]] uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) { return vabd_u16(a, b); } -// CHECK-LABEL: test_vabd_u32 -// CHECK: vabd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4 +// CHECK: ret <2 x i32> [[VABD2_I]] uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) { return vabd_u32(a, b); } -// CHECK-LABEL: test_vabd_f32 -// CHECK: vabd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]]) #4 +// CHECK: ret <2 x float> [[VABD2_I]] float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) { return vabd_f32(a, b); } -// CHECK-LABEL: test_vabdq_s8 -// CHECK: vabd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VABD_I]] int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) { return vabdq_s8(a, b); } -// CHECK-LABEL: test_vabdq_s16 -// CHECK: vabd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4 +// CHECK: ret <8 x i16> [[VABD2_I]] int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) { return vabdq_s16(a, b); } -// CHECK-LABEL: test_vabdq_s32 -// CHECK: vabd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4 +// CHECK: ret <4 x i32> [[VABD2_I]] int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) { return vabdq_s32(a, b); } -// CHECK-LABEL: test_vabdq_u8 -// CHECK: vabd.u8 
q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VABD_I]] uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) { return vabdq_u8(a, b); } -// CHECK-LABEL: test_vabdq_u16 -// CHECK: vabd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4 +// CHECK: ret <8 x i16> [[VABD2_I]] uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) { return vabdq_u16(a, b); } -// CHECK-LABEL: test_vabdq_u32 -// CHECK: vabd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4 +// CHECK: ret <4 x i32> [[VABD2_I]] uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) { return vabdq_u32(a, b); } -// CHECK-LABEL: test_vabdq_f32 -// CHECK: vabd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]]) #4 +// CHECK: ret <4 x float> [[VABD2_I]] float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) { return vabdq_f32(a, b); } -// CHECK-LABEL: test_vabdl_s8 -// CHECK: vabdl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I]] int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { return vabdl_s8(a, b); } -// CHECK-LABEL: test_vabdl_s16 -// CHECK: vabdl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: 
[[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I]] int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) { return vabdl_s16(a, b); } -// CHECK-LABEL: test_vabdl_s32 -// CHECK: vabdl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I]] int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) { return vabdl_s32(a, b); } -// CHECK-LABEL: test_vabdl_u8 -// CHECK: vabdl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I]] uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) { return vabdl_u8(a, b); } -// CHECK-LABEL: test_vabdl_u16 -// CHECK: vabdl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I]] uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) { return vabdl_u16(a, b); } -// CHECK-LABEL: test_vabdl_u32 -// CHECK: vabdl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I]] uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) { return vabdl_u32(a, b); } -// CHECK-LABEL: test_vabs_s8 -// CHECK: vabs.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 { +// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VABS_I]] int8x8_t 
test_vabs_s8(int8x8_t a) { return vabs_s8(a); } -// CHECK-LABEL: test_vabs_s16 -// CHECK: vabs.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[VABS_I]]) #4 +// CHECK: ret <4 x i16> [[VABS1_I]] int16x4_t test_vabs_s16(int16x4_t a) { return vabs_s16(a); } -// CHECK-LABEL: test_vabs_s32 -// CHECK: vabs.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[VABS_I]]) #4 +// CHECK: ret <2 x i32> [[VABS1_I]] int32x2_t test_vabs_s32(int32x2_t a) { return vabs_s32(a); } -// CHECK-LABEL: test_vabs_f32 -// CHECK: vabs.f32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #4 +// CHECK: ret <2 x float> [[VABS1_I]] float32x2_t test_vabs_f32(float32x2_t a) { return vabs_f32(a); } -// CHECK-LABEL: test_vabsq_s8 -// CHECK: vabs.s8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 { +// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a) #4 +// CHECK: ret <16 x i8> [[VABS_I]] int8x16_t test_vabsq_s8(int8x16_t a) { return vabsq_s8(a); } -// CHECK-LABEL: test_vabsq_s16 -// CHECK: vabs.s16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[VABS_I]]) #4 +// CHECK: ret <8 x i16> [[VABS1_I]] int16x8_t test_vabsq_s16(int16x8_t a) { return vabsq_s16(a); } -// CHECK-LABEL: test_vabsq_s32 -// CHECK: vabs.s32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[VABS_I]]) #4 +// CHECK: ret <4 x i32> [[VABS1_I]] int32x4_t test_vabsq_s32(int32x4_t a) { return vabsq_s32(a); } -// CHECK-LABEL: test_vabsq_f32 -// CHECK: vabs.f32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #4 +// CHECK: ret <4 x float> [[VABS1_I]] float32x4_t test_vabsq_f32(float32x4_t a) { return vabsq_f32(a); } -// CHECK-LABEL: test_vadd_s8 -// CHECK: vadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) { return vadd_s8(a, b); } -// CHECK-LABEL: test_vadd_s16 
-// CHECK: vadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) { return vadd_s16(a, b); } -// CHECK-LABEL: test_vadd_s32 -// CHECK: vadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) { return vadd_s32(a, b); } -// CHECK-LABEL: test_vadd_s64 -// CHECK: vadd.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) { return vadd_s64(a, b); } -// CHECK-LABEL: test_vadd_f32 -// CHECK: vadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, %b +// CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) { return vadd_f32(a, b); } -// CHECK-LABEL: test_vadd_u8 -// CHECK: vadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) { return vadd_u8(a, b); } -// CHECK-LABEL: test_vadd_u16 -// CHECK: vadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) { return vadd_u16(a, b); } -// CHECK-LABEL: test_vadd_u32 -// CHECK: vadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) { return vadd_u32(a, b); } -// CHECK-LABEL: test_vadd_u64 -// CHECK: vadd.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) { return vadd_u64(a, b); } -// CHECK-LABEL: test_vaddq_s8 -// CHECK: vadd.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) { return vaddq_s8(a, b); } -// CHECK-LABEL: test_vaddq_s16 -// CHECK: vadd.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) { return vaddq_s16(a, b); } -// CHECK-LABEL: test_vaddq_s32 -// CHECK: vadd.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[ADD_I]] 
int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) { return vaddq_s32(a, b); } -// CHECK-LABEL: test_vaddq_s64 -// CHECK: vadd.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) { return vaddq_s64(a, b); } -// CHECK-LABEL: test_vaddq_f32 -// CHECK: vadd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, %b +// CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) { return vaddq_f32(a, b); } -// CHECK-LABEL: test_vaddq_u8 -// CHECK: vadd.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) { return vaddq_u8(a, b); } -// CHECK-LABEL: test_vaddq_u16 -// CHECK: vadd.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) { return vaddq_u16(a, b); } -// CHECK-LABEL: test_vaddq_u32 -// CHECK: vadd.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) { return vaddq_u32(a, b); } -// CHECK-LABEL: test_vaddq_u64 -// CHECK: vadd.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) { return vaddq_u64(a, b); } -// CHECK-LABEL: test_vaddhn_s16 -// CHECK: vaddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[VADDHN2_I]] int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) { return vaddhn_s16(a, b); } -// CHECK-LABEL: test_vaddhn_s32 -// CHECK: vaddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[VADDHN2_I]] int16x4_t 
test_vaddhn_s32(int32x4_t a, int32x4_t b) { return vaddhn_s32(a, b); } -// CHECK-LABEL: test_vaddhn_s64 -// CHECK: vaddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[VADDHN2_I]] int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) { return vaddhn_s64(a, b); } -// CHECK-LABEL: test_vaddhn_u16 -// CHECK: vaddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[VADDHN2_I]] uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) { return vaddhn_u16(a, b); } -// CHECK-LABEL: test_vaddhn_u32 -// CHECK: vaddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[VADDHN2_I]] uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) { return vaddhn_u32(a, b); } -// CHECK-LABEL: test_vaddhn_u64 -// CHECK: vaddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[VADDHN2_I]] uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) { return vaddhn_u64(a, b); } -// CHECK-LABEL: test_vaddl_s8 -// CHECK: vaddl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) { return vaddl_s8(a, b); } -// CHECK-LABEL: 
test_vaddl_s16 -// CHECK: vaddl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) { return vaddl_s16(a, b); } -// CHECK-LABEL: test_vaddl_s32 -// CHECK: vaddl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) { return vaddl_s32(a, b); } -// CHECK-LABEL: test_vaddl_u8 -// CHECK: vaddl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) { return vaddl_u8(a, b); } -// CHECK-LABEL: test_vaddl_u16 -// CHECK: vaddl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) { return vaddl_u16(a, b); } -// CHECK-LABEL: test_vaddl_u32 -// CHECK: vaddl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) { return vaddl_u32(a, b); } -// CHECK-LABEL: test_vaddw_s8 -// CHECK: vaddw.s8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define 
<8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) { return vaddw_s8(a, b); } -// CHECK-LABEL: test_vaddw_s16 -// CHECK: vaddw.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) { return vaddw_s16(a, b); } -// CHECK-LABEL: test_vaddw_s32 -// CHECK: vaddw.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) { return vaddw_s32(a, b); } -// CHECK-LABEL: test_vaddw_u8 -// CHECK: vaddw.u8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) { return vaddw_u8(a, b); } -// CHECK-LABEL: test_vaddw_u16 -// CHECK: vaddw.u16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) { return vaddw_u16(a, b); } -// CHECK-LABEL: test_vaddw_u32 -// CHECK: vaddw.u32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) { return vaddw_u32(a, b); } -// CHECK-LABEL: test_vand_s8 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[AND_I]] int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) { return vand_s8(a, b); } -// CHECK-LABEL: test_vand_s16 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[AND_I]] int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) { return vand_s16(a, b); } -// CHECK-LABEL: test_vand_s32 -// CHECK: vand 
d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[AND_I]] int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) { return vand_s32(a, b); } -// CHECK-LABEL: test_vand_s64 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[AND_I]] int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) { return vand_s64(a, b); } -// CHECK-LABEL: test_vand_u8 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[AND_I]] uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) { return vand_u8(a, b); } -// CHECK-LABEL: test_vand_u16 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[AND_I]] uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) { return vand_u16(a, b); } -// CHECK-LABEL: test_vand_u32 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[AND_I]] uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) { return vand_u32(a, b); } -// CHECK-LABEL: test_vand_u64 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[AND_I]] uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) { return vand_u64(a, b); } -// CHECK-LABEL: test_vandq_s8 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[AND_I]] int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) { return vandq_s8(a, b); } -// CHECK-LABEL: test_vandq_s16 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[AND_I]] int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) { return vandq_s16(a, b); } -// CHECK-LABEL: test_vandq_s32 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[AND_I]] int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) { return vandq_s32(a, b); } -// CHECK-LABEL: test_vandq_s64 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[AND_I]] int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) { return vandq_s64(a, b); } -// CHECK-LABEL: test_vandq_u8 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[AND_I]] uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) { return 
vandq_u8(a, b); } -// CHECK-LABEL: test_vandq_u16 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[AND_I]] uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) { return vandq_u16(a, b); } -// CHECK-LABEL: test_vandq_u32 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[AND_I]] uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) { return vandq_u32(a, b); } -// CHECK-LABEL: test_vandq_u64 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[AND_I]] uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) { return vandq_u64(a, b); } -// CHECK-LABEL: test_vbic_s8 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[AND_I]] int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) { return vbic_s8(a, b); } -// CHECK-LABEL: test_vbic_s16 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[AND_I]] int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) { return vbic_s16(a, b); } -// CHECK-LABEL: test_vbic_s32 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[AND_I]] int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) { return vbic_s32(a, b); } -// CHECK-LABEL: test_vbic_s64 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[AND_I]] int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) { return vbic_s64(a, b); } -// CHECK-LABEL: test_vbic_u8 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[AND_I]] uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) { return vbic_u8(a, b); } -// CHECK-LABEL: test_vbic_u16 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[AND_I]] uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) { return vbic_u16(a, b); } -// CHECK-LABEL: test_vbic_u32 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]] +// CHECK: 
ret <2 x i32> [[AND_I]] uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) { return vbic_u32(a, b); } -// CHECK-LABEL: test_vbic_u64 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[AND_I]] uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) { return vbic_u64(a, b); } -// CHECK-LABEL: test_vbicq_s8 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[AND_I]] int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) { return vbicq_s8(a, b); } -// CHECK-LABEL: test_vbicq_s16 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[AND_I]] int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) { return vbicq_s16(a, b); } -// CHECK-LABEL: test_vbicq_s32 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[AND_I]] int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) { return vbicq_s32(a, b); } -// CHECK-LABEL: test_vbicq_s64 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[AND_I]] int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) { return vbicq_s64(a, b); } -// CHECK-LABEL: test_vbicq_u8 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[AND_I]] uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) { return vbicq_u8(a, b); } -// CHECK-LABEL: test_vbicq_u16 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[AND_I]] uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) { return vbicq_u16(a, b); } -// CHECK-LABEL: test_vbicq_u32 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[AND_I]] uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) { return vbicq_u32(a, b); } -// CHECK-LABEL: test_vbicq_u64 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[AND_I]] uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) { return vbicq_u64(a, b); } 
-// CHECK-LABEL: test_vbsl_s8 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %a, %b +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) { return vbsl_s8(a, b, c); } -// CHECK-LABEL: test_vbsl_s16 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vbsl_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) { return vbsl_s16(a, b, c); } -// CHECK-LABEL: test_vbsl_s32 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i32> [[VBSL5_I]] int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) { return vbsl_s32(a, b, c); } -// CHECK-LABEL: test_vbsl_s64 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <1 x i64> [[VBSL5_I]] int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) { return vbsl_s64(a, b, c); } -// CHECK-LABEL: test_vbsl_u8 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> 
%a, %b +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vbsl_u8(a, b, c); } -// CHECK-LABEL: test_vbsl_u16 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vbsl_u16(a, b, c); } -// CHECK-LABEL: test_vbsl_u32 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i32> [[VBSL5_I]] uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vbsl_u32(a, b, c); } -// CHECK-LABEL: test_vbsl_u64 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <1 x i64> [[VBSL5_I]] uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) { return vbsl_u64(a, b, c); } -// CHECK-LABEL: test_vbsl_f32 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x i32> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> 
[[TMP0]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP4]] float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) { return vbsl_f32(a, b, c); } -// CHECK-LABEL: test_vbsl_p8 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %a, %b +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) { return vbsl_p8(a, b, c); } -// CHECK-LABEL: test_vbsl_p16 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) { return vbsl_p16(a, b, c); } -// CHECK-LABEL: test_vbslq_s8 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %a, %b +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) { return vbslq_s8(a, b, c); } -// CHECK-LABEL: test_vbslq_s16 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]] int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) { return vbslq_s16(a, b, c); } 
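For reference, every vbsl hunk above checks the same bitwise-select expansion: each result bit is taken from the second operand where the mask bit is set and from the third operand where it is clear. A minimal scalar sketch of that identity (illustrative only; the function name is invented here and is not part of the test or the patch):

#include <stdint.h>

static inline uint8_t bsl_bit_sketch(uint8_t mask, uint8_t b, uint8_t c) {
  /* Same pattern as the checked IR: (mask & b) | (~mask & c). */
  return (uint8_t)((mask & b) | (~mask & c));
}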
-// CHECK-LABEL: test_vbslq_s32 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i32> [[VBSL5_I]] int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) { return vbslq_s32(a, b, c); } -// CHECK-LABEL: test_vbslq_s64 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i64> [[VBSL5_I]] int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) { return vbslq_s64(a, b, c); } -// CHECK-LABEL: test_vbslq_u8 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %a, %b +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vbslq_u8(a, b, c); } -// CHECK-LABEL: test_vbslq_u16 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]] uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vbslq_u16(a, b, c); } -// CHECK-LABEL: test_vbslq_u32 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %a, <4 x i32> 
%b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i32> [[VBSL5_I]] uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vbslq_u32(a, b, c); } -// CHECK-LABEL: test_vbslq_u64 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i64> [[VBSL5_I]] uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return vbslq_u64(a, b, c); } -// CHECK-LABEL: test_vbslq_f32 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP4]] float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) { return vbslq_f32(a, b, c); } -// CHECK-LABEL: test_vbslq_p8 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %a, %b +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) { return vbslq_p8(a, b, c); } -// CHECK-LABEL: test_vbslq_p16 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to 
<16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]] poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) { return vbslq_p16(a, b, c); } -// CHECK-LABEL: test_vcage_f32 -// CHECK: vacge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCAGE_V2_I]] uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) { return vcage_f32(a, b); } -// CHECK-LABEL: test_vcageq_f32 -// CHECK: vacge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]] uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) { return vcageq_f32(a, b); } -// CHECK-LABEL: test_vcagt_f32 -// CHECK: vacgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCAGT_V2_I]] uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) { return vcagt_f32(a, b); } -// CHECK-LABEL: test_vcagtq_f32 -// CHECK: vacgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]] uint32x4_t test_vcagtq_f32(float32x4_t a, 
float32x4_t b) { return vcagtq_f32(a, b); } -// CHECK-LABEL: test_vcale_f32 -// CHECK: vacge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCALE_V2_I]] uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) { return vcale_f32(a, b); } -// CHECK-LABEL: test_vcaleq_f32 -// CHECK: vacge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCALEQ_V2_I]] uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) { return vcaleq_f32(a, b); } -// CHECK-LABEL: test_vcalt_f32 -// CHECK: vacgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCALT_V2_I]] uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) { return vcalt_f32(a, b); } -// CHECK-LABEL: test_vcaltq_f32 -// CHECK: vacgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCALTQ_V2_I]] uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) { return vcaltq_f32(a, b); } -// CHECK-LABEL: test_vceq_s8 -// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) { return vceq_s8(a, b); } -// CHECK-LABEL: test_vceq_s16 -// CHECK: vceq.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = 
sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) { return vceq_s16(a, b); } -// CHECK-LABEL: test_vceq_s32 -// CHECK: vceq.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) { return vceq_s32(a, b); } -// CHECK-LABEL: test_vceq_f32 -// CHECK: vceq.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) { return vceq_f32(a, b); } -// CHECK-LABEL: test_vceq_u8 -// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) { return vceq_u8(a, b); } -// CHECK-LABEL: test_vceq_u16 -// CHECK: vceq.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) { return vceq_u16(a, b); } -// CHECK-LABEL: test_vceq_u32 -// CHECK: vceq.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) { return vceq_u32(a, b); } -// CHECK-LABEL: test_vceq_p8 -// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) { return vceq_p8(a, b); } -// CHECK-LABEL: test_vceqq_s8 -// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) { return vceqq_s8(a, b); } -// CHECK-LABEL: test_vceqq_s16 -// CHECK: vceq.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) { return vceqq_s16(a, b); } -// CHECK-LABEL: test_vceqq_s32 -// CHECK: vceq.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: 
[[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) { return vceqq_s32(a, b); } -// CHECK-LABEL: test_vceqq_f32 -// CHECK: vceq.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) { return vceqq_f32(a, b); } -// CHECK-LABEL: test_vceqq_u8 -// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) { return vceqq_u8(a, b); } -// CHECK-LABEL: test_vceqq_u16 -// CHECK: vceq.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) { return vceqq_u16(a, b); } -// CHECK-LABEL: test_vceqq_u32 -// CHECK: vceq.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) { return vceqq_u32(a, b); } -// CHECK-LABEL: test_vceqq_p8 -// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) { return vceqq_p8(a, b); } -// CHECK-LABEL: test_vcge_s8 -// CHECK: vcge.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) { return vcge_s8(a, b); } -// CHECK-LABEL: test_vcge_s16 -// CHECK: vcge.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) { return vcge_s16(a, b); } -// CHECK-LABEL: test_vcge_s32 -// CHECK: vcge.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) { return vcge_s32(a, b); } -// CHECK-LABEL: test_vcge_f32 -// CHECK: vcge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// 
CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) { return vcge_f32(a, b); } -// CHECK-LABEL: test_vcge_u8 -// CHECK: vcge.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) { return vcge_u8(a, b); } -// CHECK-LABEL: test_vcge_u16 -// CHECK: vcge.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) { return vcge_u16(a, b); } -// CHECK-LABEL: test_vcge_u32 -// CHECK: vcge.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) { return vcge_u32(a, b); } -// CHECK-LABEL: test_vcgeq_s8 -// CHECK: vcge.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) { return vcgeq_s8(a, b); } -// CHECK-LABEL: test_vcgeq_s16 -// CHECK: vcge.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) { return vcgeq_s16(a, b); } -// CHECK-LABEL: test_vcgeq_s32 -// CHECK: vcge.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) { return vcgeq_s32(a, b); } -// CHECK-LABEL: test_vcgeq_f32 -// CHECK: vcge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) { return vcgeq_f32(a, b); } -// CHECK-LABEL: test_vcgeq_u8 -// CHECK: vcge.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) { return vcgeq_u8(a, 
b); } -// CHECK-LABEL: test_vcgeq_u16 -// CHECK: vcge.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) { return vcgeq_u16(a, b); } -// CHECK-LABEL: test_vcgeq_u32 -// CHECK: vcge.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) { return vcgeq_u32(a, b); } -// CHECK-LABEL: test_vcgt_s8 -// CHECK: vcgt.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) { return vcgt_s8(a, b); } -// CHECK-LABEL: test_vcgt_s16 -// CHECK: vcgt.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) { return vcgt_s16(a, b); } -// CHECK-LABEL: test_vcgt_s32 -// CHECK: vcgt.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) { return vcgt_s32(a, b); } -// CHECK-LABEL: test_vcgt_f32 -// CHECK: vcgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) { return vcgt_f32(a, b); } -// CHECK-LABEL: test_vcgt_u8 -// CHECK: vcgt.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) { return vcgt_u8(a, b); } -// CHECK-LABEL: test_vcgt_u16 -// CHECK: vcgt.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) { return vcgt_u16(a, b); } -// CHECK-LABEL: test_vcgt_u32 -// CHECK: vcgt.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] 
uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) { return vcgt_u32(a, b); } -// CHECK-LABEL: test_vcgtq_s8 -// CHECK: vcgt.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) { return vcgtq_s8(a, b); } -// CHECK-LABEL: test_vcgtq_s16 -// CHECK: vcgt.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) { return vcgtq_s16(a, b); } -// CHECK-LABEL: test_vcgtq_s32 -// CHECK: vcgt.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) { return vcgtq_s32(a, b); } -// CHECK-LABEL: test_vcgtq_f32 -// CHECK: vcgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) { return vcgtq_f32(a, b); } -// CHECK-LABEL: test_vcgtq_u8 -// CHECK: vcgt.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) { return vcgtq_u8(a, b); } -// CHECK-LABEL: test_vcgtq_u16 -// CHECK: vcgt.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) { return vcgtq_u16(a, b); } -// CHECK-LABEL: test_vcgtq_u32 -// CHECK: vcgt.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) { return vcgtq_u32(a, b); } -// CHECK-LABEL: test_vcle_s8 -// CHECK: vcge.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) { return vcle_s8(a, b); } -// CHECK-LABEL: test_vcle_s16 -// CHECK: vcge.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b +// 
CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) { return vcle_s16(a, b); } -// CHECK-LABEL: test_vcle_s32 -// CHECK: vcge.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) { return vcle_s32(a, b); } -// CHECK-LABEL: test_vcle_f32 -// CHECK: vcge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) { return vcle_f32(a, b); } -// CHECK-LABEL: test_vcle_u8 -// CHECK: vcge.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) { return vcle_u8(a, b); } -// CHECK-LABEL: test_vcle_u16 -// CHECK: vcge.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) { return vcle_u16(a, b); } -// CHECK-LABEL: test_vcle_u32 -// CHECK: vcge.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) { return vcle_u32(a, b); } -// CHECK-LABEL: test_vcleq_s8 -// CHECK: vcge.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) { return vcleq_s8(a, b); } -// CHECK-LABEL: test_vcleq_s16 -// CHECK: vcge.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) { return vcleq_s16(a, b); } -// CHECK-LABEL: test_vcleq_s32 -// CHECK: vcge.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) { return vcleq_s32(a, b); } -// CHECK-LABEL: test_vcleq_f32 -// CHECK: vcge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x 
float> %a, <4 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) { return vcleq_f32(a, b); } -// CHECK-LABEL: test_vcleq_u8 -// CHECK: vcge.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) { return vcleq_u8(a, b); } -// CHECK-LABEL: test_vcleq_u16 -// CHECK: vcge.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) { return vcleq_u16(a, b); } -// CHECK-LABEL: test_vcleq_u32 -// CHECK: vcge.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) { return vcleq_u32(a, b); } -// CHECK-LABEL: test_vcls_s8 -// CHECK: vcls.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 { +// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VCLS_V_I]] int8x8_t test_vcls_s8(int8x8_t a) { return vcls_s8(a); } -// CHECK-LABEL: test_vcls_s16 -// CHECK: vcls.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[VCLS_V_I]]) #4 +// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP1]] int16x4_t test_vcls_s16(int16x4_t a) { return vcls_s16(a); } -// CHECK-LABEL: test_vcls_s32 -// CHECK: vcls.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[VCLS_V_I]]) #4 +// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP1]] int32x2_t test_vcls_s32(int32x2_t a) { return vcls_s32(a); } -// CHECK-LABEL: test_vclsq_s8 -// CHECK: vcls.s8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 { +// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) #4 +// CHECK: ret <16 x i8> [[VCLSQ_V_I]] int8x16_t test_vclsq_s8(int8x16_t a) { return vclsq_s8(a); } -// CHECK-LABEL: test_vclsq_s16 -// CHECK: vcls.s16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = 
bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #4 +// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP1]] int16x8_t test_vclsq_s16(int16x8_t a) { return vclsq_s16(a); } -// CHECK-LABEL: test_vclsq_s32 -// CHECK: vcls.s32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #4 +// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP1]] int32x4_t test_vclsq_s32(int32x4_t a) { return vclsq_s32(a); } -// CHECK-LABEL: test_vclt_s8 -// CHECK: vcgt.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) { return vclt_s8(a, b); } -// CHECK-LABEL: test_vclt_s16 -// CHECK: vcgt.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) { return vclt_s16(a, b); } -// CHECK-LABEL: test_vclt_s32 -// CHECK: vcgt.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) { return vclt_s32(a, b); } -// CHECK-LABEL: test_vclt_f32 -// CHECK: vcgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) { return vclt_f32(a, b); } -// CHECK-LABEL: test_vclt_u8 -// CHECK: vcgt.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) { return vclt_u8(a, b); } -// CHECK-LABEL: test_vclt_u16 -// CHECK: vcgt.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) { return vclt_u16(a, b); } -// CHECK-LABEL: test_vclt_u32 -// CHECK: vcgt.u32 d{{[0-9]+}}, 
d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) { return vclt_u32(a, b); } -// CHECK-LABEL: test_vcltq_s8 -// CHECK: vcgt.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) { return vcltq_s8(a, b); } -// CHECK-LABEL: test_vcltq_s16 -// CHECK: vcgt.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) { return vcltq_s16(a, b); } -// CHECK-LABEL: test_vcltq_s32 -// CHECK: vcgt.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) { return vcltq_s32(a, b); } -// CHECK-LABEL: test_vcltq_f32 -// CHECK: vcgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) { return vcltq_f32(a, b); } -// CHECK-LABEL: test_vcltq_u8 -// CHECK: vcgt.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) { return vcltq_u8(a, b); } -// CHECK-LABEL: test_vcltq_u16 -// CHECK: vcgt.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) { return vcltq_u16(a, b); } -// CHECK-LABEL: test_vcltq_u32 -// CHECK: vcgt.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) { return vcltq_u32(a, b); } -// CHECK-LABEL: test_vclz_s8 -// CHECK: vclz.i8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 { +// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4 +// CHECK: ret <8 x i8> [[VCLZ_V_I]] int8x8_t test_vclz_s8(int8x8_t a) { return vclz_s8(a); } -// CHECK-LABEL: test_vclz_s16 -// 
CHECK: vclz.i16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4 +// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP1]] int16x4_t test_vclz_s16(int16x4_t a) { return vclz_s16(a); } -// CHECK-LABEL: test_vclz_s32 -// CHECK: vclz.i32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4 +// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP1]] int32x2_t test_vclz_s32(int32x2_t a) { return vclz_s32(a); } -// CHECK-LABEL: test_vclz_u8 -// CHECK: vclz.i8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 { +// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4 +// CHECK: ret <8 x i8> [[VCLZ_V_I]] uint8x8_t test_vclz_u8(uint8x8_t a) { return vclz_u8(a); } -// CHECK-LABEL: test_vclz_u16 -// CHECK: vclz.i16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4 +// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP1]] uint16x4_t test_vclz_u16(uint16x4_t a) { return vclz_u16(a); } -// CHECK-LABEL: test_vclz_u32 -// CHECK: vclz.i32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4 +// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP1]] uint32x2_t test_vclz_u32(uint32x2_t a) { return vclz_u32(a); } -// CHECK-LABEL: test_vclzq_s8 -// CHECK: vclz.i8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 { +// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4 +// CHECK: ret <16 x i8> [[VCLZQ_V_I]] int8x16_t test_vclzq_s8(int8x16_t a) { return vclzq_s8(a); } -// CHECK-LABEL: test_vclzq_s16 -// CHECK: vclz.i16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4 +// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = 
bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP1]] int16x8_t test_vclzq_s16(int16x8_t a) { return vclzq_s16(a); } -// CHECK-LABEL: test_vclzq_s32 -// CHECK: vclz.i32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4 +// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP1]] int32x4_t test_vclzq_s32(int32x4_t a) { return vclzq_s32(a); } -// CHECK-LABEL: test_vclzq_u8 -// CHECK: vclz.i8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 { +// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4 +// CHECK: ret <16 x i8> [[VCLZQ_V_I]] uint8x16_t test_vclzq_u8(uint8x16_t a) { return vclzq_u8(a); } -// CHECK-LABEL: test_vclzq_u16 -// CHECK: vclz.i16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4 +// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP1]] uint16x8_t test_vclzq_u16(uint16x8_t a) { return vclzq_u16(a); } -// CHECK-LABEL: test_vclzq_u32 -// CHECK: vclz.i32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4 +// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP1]] uint32x4_t test_vclzq_u32(uint32x4_t a) { return vclzq_u32(a); } -// CHECK-LABEL: test_vcnt_u8 -// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 { +// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VCNT_V_I]] uint8x8_t test_vcnt_u8(uint8x8_t a) { return vcnt_u8(a); } -// CHECK-LABEL: test_vcnt_s8 -// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 { +// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VCNT_V_I]] int8x8_t test_vcnt_s8(int8x8_t a) { return vcnt_s8(a); } -// CHECK-LABEL: test_vcnt_p8 -// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 { +// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VCNT_V_I]] poly8x8_t test_vcnt_p8(poly8x8_t a) { return vcnt_p8(a); } -// CHECK-LABEL: test_vcntq_u8 -// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 { +// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4 +// CHECK: ret <16 x i8> [[VCNTQ_V_I]] 
uint8x16_t test_vcntq_u8(uint8x16_t a) { return vcntq_u8(a); } -// CHECK-LABEL: test_vcntq_s8 -// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 { +// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4 +// CHECK: ret <16 x i8> [[VCNTQ_V_I]] int8x16_t test_vcntq_s8(int8x16_t a) { return vcntq_s8(a); } -// CHECK-LABEL: test_vcntq_p8 -// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 { +// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4 +// CHECK: ret <16 x i8> [[VCNTQ_V_I]] poly8x16_t test_vcntq_p8(poly8x16_t a) { return vcntq_p8(a); } -// CHECK-LABEL: test_vcombine_s8 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) { return vcombine_s8(a, b); } -// CHECK-LABEL: test_vcombine_s16 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) { return vcombine_s16(a, b); } -// CHECK-LABEL: test_vcombine_s32 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) { return vcombine_s32(a, b); } -// CHECK-LABEL: test_vcombine_s64 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) { return vcombine_s64(a, b); } -// CHECK-LABEL: test_vcombine_f16 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %a, <4 x half> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> +// CHECK: ret <8 x half> [[SHUFFLE_I]] float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) { return vcombine_f16(a, b); } -// CHECK-LABEL: test_vcombine_f32 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) { return vcombine_f32(a, b); } -// CHECK-LABEL: test_vcombine_u8 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %a, 
<8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) { return vcombine_u8(a, b); } -// CHECK-LABEL: test_vcombine_u16 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) { return vcombine_u16(a, b); } -// CHECK-LABEL: test_vcombine_u32 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) { return vcombine_u32(a, b); } -// CHECK-LABEL: test_vcombine_u64 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) { return vcombine_u64(a, b); } -// CHECK-LABEL: test_vcombine_p8 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) { return vcombine_p8(a, b); } -// CHECK-LABEL: test_vcombine_p16 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) { return vcombine_p16(a, b); } -// CHECK-LABEL: test_vcreate_s8 -// CHECK: vmov [[REG:d[0-9]+]], r0, r1 -// CHECK: vclz.i8 d{{[0-9]+}}, [[REG]] +// CHECK-LABEL: define <8 x i8> @test_vcreate_s8(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4 +// CHECK: ret <8 x i8> [[VCLZ_V_I]] int8x8_t test_vcreate_s8(uint64_t a) { return vclz_s8(vcreate_s8(a)); } -// CHECK-LABEL: test_vcreate_s16 -// CHECK: vmov [[REG:d[0-9]+]], r0, r1 -// CHECK: vclz.i16 d{{[0-9]+}}, [[REG]] +// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4 +// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vcreate_s16(uint64_t a) { return 
vclz_s16(vcreate_s16(a)); }
-// CHECK-LABEL: test_vcreate_s32
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i32 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vcreate_s32(uint64_t a) { return vclz_s32(vcreate_s32(a)); }
-// CHECK-LABEL: test_vcreate_f16
+// CHECK-LABEL: define <4 x half> @test_vcreate_f16(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
+// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vcreate_f16(uint64_t a) { return vcreate_f16(a); }
-// CHECK-LABEL: test_vcreate_f32
+// CHECK-LABEL: define <2 x float> @test_vcreate_f32(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
+// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vcreate_f32(uint64_t a) { return vcreate_f32(a); }
-// CHECK-LABEL: test_vcreate_u8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_u8(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
+// CHECK: ret <8 x i8> [[VCLZ_V_I]]
uint8x8_t test_vcreate_u8(uint64_t a) { return vclz_s8(vcreate_u8(a)); }
-// CHECK-LABEL: test_vcreate_u16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i16 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vcreate_u16(uint64_t a) { return vclz_s16(vcreate_u16(a)); }
-// CHECK-LABEL: test_vcreate_u32
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i32 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vcreate_u32(uint64_t a) { return vclz_s32(vcreate_u32(a)); }
@@ -1614,10145 +2449,21932 @@
// We have two ways of lowering that: either with one 'vmov d, r, r' or
// with two 'vmov d[],r'. LLVM does the latter. We may want to be less
// strict about the matching pattern if it starts causing problems.
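// Illustrative sketch only (this helper is hypothetical and carries no
// CHECK lines): at the IR level vcreate_u64 merely reinterprets the 64-bit
// scalar, which is why the updated checks below look for a bitcast to
// <1 x i64> rather than for either vmov sequence.
//
// uint64x1_t example_vcreate_u64(uint64_t bits) {
//   // Reinterprets the i64 bit pattern as a one-element vector; with
//   // -emit-llvm this shows up as 'bitcast i64 %bits to <1 x i64>'.
//   return vcreate_u64(bits);
// }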
-// CHECK-LABEL: test_vcreate_u64 -// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0 -// CHECK: vmov.32 [[REG]][1], r1 +// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]] +// CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vcreate_u64(uint64_t a) { uint64x1_t tmp = vcreate_u64(a); return vadd_u64(tmp, tmp); } -// CHECK-LABEL: test_vcreate_p8 -// CHECK: vmov [[REG:d[0-9]+]], r0, r1 -// CHECK: vcnt.8 d{{[0-9]+}}, [[REG]] +// CHECK-LABEL: define <8 x i8> @test_vcreate_p8(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8> +// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) #4 +// CHECK: ret <8 x i8> [[VCNT_V_I]] poly8x8_t test_vcreate_p8(uint64_t a) { return vcnt_p8(vcreate_p8(a)); } -// CHECK-LABEL: test_vcreate_p16 -// CHECK: vmov [[REG:d[0-9]+]], r0, r1 +// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP4:%.*]] = xor <4 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP4]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] poly16x4_t test_vcreate_p16(uint64_t a) { poly16x4_t tmp = vcreate_p16(a); return vbsl_p16(tmp, tmp, tmp); } -// CHECK-LABEL: test_vcreate_s64 -// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0 -// CHECK: vmov.32 [[REG]][1], r1 +// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]] +// CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vcreate_s64(uint64_t a) { int64x1_t tmp = vcreate_s64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: test_vcvt_f16_f32 -// CHECK: vcvt.f16.f32 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #4 +// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half> +// CHECK: ret <4 x half> [[TMP1]] float16x4_t test_vcvt_f16_f32(float32x4_t a) { return vcvt_f16_f32(a); } -// CHECK-LABEL: test_vcvt_f32_s32 -// CHECK: vcvt.f32.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> +// CHECK: ret <2 x float> [[VCVT_I]] float32x2_t test_vcvt_f32_s32(int32x2_t a) { return vcvt_f32_s32(a); } -// CHECK-LABEL: test_vcvt_f32_u32 -// CHECK: vcvt.f32.u32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x 
i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK: ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_u32(uint32x2_t a) { return vcvt_f32_u32(a); }
-// CHECK-LABEL: test_vcvtq_f32_s32
-// CHECK: vcvt.f32.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK: ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_s32(int32x4_t a) { return vcvtq_f32_s32(a); }
-// CHECK-LABEL: test_vcvtq_f32_u32
-// CHECK: vcvt.f32.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK: ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_u32(uint32x4_t a) { return vcvtq_f32_u32(a); }
-// CHECK-LABEL: test_vcvt_f32_f16
-// CHECK: vcvt.f32.f16
+// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #4
+// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
+// CHECK: ret <4 x float> [[TMP1]]
float32x4_t test_vcvt_f32_f16(float16x4_t a) { return vcvt_f32_f16(a); }
-// CHECK-LABEL: test_vcvt_n_f32_s32
-// CHECK: vcvt.f32.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK: ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_s32(int32x2_t a) { return vcvt_n_f32_s32(a, 1); }
-// CHECK-LABEL: test_vcvt_n_f32_u32
-// CHECK: vcvt.f32.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK: ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) { return vcvt_n_f32_u32(a, 1); }
-// CHECK-LABEL: test_vcvtq_n_f32_s32
-// CHECK: vcvt.f32.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK: ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) { return vcvtq_n_f32_s32(a, 3); }
-// CHECK-LABEL: test_vcvtq_n_f32_u32
-// CHECK: vcvt.f32.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK: ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) { return vcvtq_n_f32_u32(a, 3); }
-// CHECK-LABEL: test_vcvt_n_s32_f32
-// CHECK: vcvt.s32.f32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK: ret <2 x i32> [[VCVT_N1]]
int32x2_t test_vcvt_n_s32_f32(float32x2_t a) { return vcvt_n_s32_f32(a, 1); }
-// CHECK-LABEL: test_vcvtq_n_s32_f32
-// CHECK: vcvt.s32.f32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK: ret <4 x i32> [[VCVT_N1]]
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) { return vcvtq_n_s32_f32(a, 3); }
-// CHECK-LABEL: test_vcvt_n_u32_f32
-// CHECK: vcvt.u32.f32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK: ret <2 x i32> [[VCVT_N1]]
uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) { return vcvt_n_u32_f32(a, 1); }
-// CHECK-LABEL: test_vcvtq_n_u32_f32
-// CHECK: vcvt.u32.f32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK: ret <4 x i32> [[VCVT_N1]]
uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) { return vcvtq_n_u32_f32(a, 3); }
-// CHECK-LABEL: test_vcvt_s32_f32
-// CHECK: vcvt.s32.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[TMP2:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vcvt_s32_f32(float32x2_t a) { return vcvt_s32_f32(a); }
-// CHECK-LABEL: test_vcvtq_s32_f32
-// CHECK: vcvt.s32.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t
test_vcvtq_s32_f32(float32x4_t a) { return vcvtq_s32_f32(a); } -// CHECK-LABEL: test_vcvt_u32_f32 -// CHECK: vcvt.u32.f32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vcvt_u32_f32(float32x2_t a) { return vcvt_u32_f32(a); } -// CHECK-LABEL: test_vcvtq_u32_f32 -// CHECK: vcvt.u32.f32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { return vcvtq_u32_f32(a); } -// CHECK-LABEL: test_vdup_lane_u8 -// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE]] uint8x8_t test_vdup_lane_u8(uint8x8_t a) { return vdup_lane_u8(a, 7); } -// CHECK-LABEL: test_vdup_lane_u16 -// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE]] uint16x4_t test_vdup_lane_u16(uint16x4_t a) { return vdup_lane_u16(a, 3); } -// CHECK-LABEL: test_vdup_lane_u32 -// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE]] uint32x2_t test_vdup_lane_u32(uint32x2_t a) { return vdup_lane_u32(a, 1); } -// CHECK-LABEL: test_vdup_lane_s8 -// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE]] int8x8_t test_vdup_lane_s8(int8x8_t a) { return vdup_lane_s8(a, 7); } -// CHECK-LABEL: test_vdup_lane_s16 -// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE]] int16x4_t test_vdup_lane_s16(int16x4_t a) { return vdup_lane_s16(a, 3); } -// CHECK-LABEL: test_vdup_lane_s32 -// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE]] int32x2_t test_vdup_lane_s32(int32x2_t a) { return vdup_lane_s32(a, 1); } -// CHECK-LABEL: test_vdup_lane_p8 -// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE]] poly8x8_t test_vdup_lane_p8(poly8x8_t a) { return vdup_lane_p8(a, 7); } -// CHECK-LABEL: test_vdup_lane_p16 -// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// 
CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE]] poly16x4_t test_vdup_lane_p16(poly16x4_t a) { return vdup_lane_p16(a, 3); } -// CHECK-LABEL: test_vdup_lane_f32 -// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32(<2 x float> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE]] float32x2_t test_vdup_lane_f32(float32x2_t a) { return vdup_lane_f32(a, 1); } -// CHECK-LABEL: test_vdupq_lane_u8 -// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE]] uint8x16_t test_vdupq_lane_u8(uint8x8_t a) { return vdupq_lane_u8(a, 7); } -// CHECK-LABEL: test_vdupq_lane_u16 -// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE]] uint16x8_t test_vdupq_lane_u16(uint16x4_t a) { return vdupq_lane_u16(a, 3); } -// CHECK-LABEL: test_vdupq_lane_u32 -// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE]] uint32x4_t test_vdupq_lane_u32(uint32x2_t a) { return vdupq_lane_u32(a, 1); } -// CHECK-LABEL: test_vdupq_lane_s8 -// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE]] int8x16_t test_vdupq_lane_s8(int8x8_t a) { return vdupq_lane_s8(a, 7); } -// CHECK-LABEL: test_vdupq_lane_s16 -// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE]] int16x8_t test_vdupq_lane_s16(int16x4_t a) { return vdupq_lane_s16(a, 3); } -// CHECK-LABEL: test_vdupq_lane_s32 -// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE]] int32x4_t test_vdupq_lane_s32(int32x2_t a) { return vdupq_lane_s32(a, 1); } -// CHECK-LABEL: test_vdupq_lane_p8 -// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE]] poly8x16_t test_vdupq_lane_p8(poly8x8_t a) { return vdupq_lane_p8(a, 7); } -// CHECK-LABEL: test_vdupq_lane_p16 -// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE]] poly16x8_t test_vdupq_lane_p16(poly16x4_t a) { return vdupq_lane_p16(a, 3); } -// CHECK-LABEL: test_vdupq_lane_f32 -// CHECK: vdup.32 
q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32(<2 x float> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE]] float32x4_t test_vdupq_lane_f32(float32x2_t a) { return vdupq_lane_f32(a, 1); } -// CHECK-LABEL: test_vdup_lane_s64 +// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64(<1 x i64> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE]] int64x1_t test_vdup_lane_s64(int64x1_t a) { return vdup_lane_s64(a, 0); } -// CHECK-LABEL: test_vdup_lane_u64 +// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64(<1 x i64> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE]] uint64x1_t test_vdup_lane_u64(uint64x1_t a) { return vdup_lane_u64(a, 0); } -// CHECK-LABEL: test_vdupq_lane_s64 -// CHECK: {{vmov|vdup}} +// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[SHUFFLE]] int64x2_t test_vdupq_lane_s64(int64x1_t a) { return vdupq_lane_s64(a, 0); } -// CHECK-LABEL: test_vdupq_lane_u64 -// CHECK: {{vmov|vdup}} +// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64(<1 x i64> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[SHUFFLE]] uint64x2_t test_vdupq_lane_u64(uint64x1_t a) { return vdupq_lane_u64(a, 0); } -// CHECK-LABEL: test_vdup_n_u8 -// CHECK: vmov +// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VECINIT7_I]] uint8x8_t test_vdup_n_u8(uint8_t a) { return vdup_n_u8(a); } -// CHECK-LABEL: test_vdup_n_u16 -// CHECK: vmov +// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VECINIT3_I]] uint16x4_t test_vdup_n_u16(uint16_t a) { return vdup_n_u16(a); } -// CHECK-LABEL: test_vdup_n_u32 -// CHECK: mov +// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VECINIT1_I]] uint32x2_t test_vdup_n_u32(uint32_t a) { return vdup_n_u32(a); } -// CHECK-LABEL: test_vdup_n_s8 -// CHECK: vmov +// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8(i8 %a) #0 
{
+// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK: ret <8 x i8> [[VECINIT7_I]]
int8x8_t test_vdup_n_s8(int8_t a) { return vdup_n_s8(a); }
-// CHECK-LABEL: test_vdup_n_s16
-// CHECK: vmov
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16(i16 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK: ret <4 x i16> [[VECINIT3_I]]
int16x4_t test_vdup_n_s16(int16_t a) { return vdup_n_s16(a); }
-// CHECK-LABEL: test_vdup_n_s32
-// CHECK: mov
+// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(i32 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK: ret <2 x i32> [[VECINIT1_I]]
int32x2_t test_vdup_n_s32(int32_t a) { return vdup_n_s32(a); }
-// CHECK-LABEL: test_vdup_n_p8
-// CHECK: vmov
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(i8 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK: ret <8 x i8> [[VECINIT7_I]]
poly8x8_t test_vdup_n_p8(poly8_t a) { return vdup_n_p8(a); }
-// CHECK-LABEL: test_vdup_n_p16
-// CHECK: vmov
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(i16 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK: ret <4 x i16> [[VECINIT3_I]]
poly16x4_t test_vdup_n_p16(poly16_t a) { return vdup_n_p16(a); }
-// CHECK-LABEL: test_vdup_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\]}}}
+// CHECK-LABEL: define <4 x half> @test_vdup_n_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
+// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK: ret <4 x half> [[VECINIT3]]
float16x4_t test_vdup_n_f16(float16_t *a) { return vdup_n_f16(*a); }
-// CHECK-LABEL: test_vdup_n_f32
-// CHECK: mov
+// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(float %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK: ret <2 x float> [[VECINIT1_I]]
float32x2_t test_vdup_n_f32(float32_t a) { return vdup_n_f32(a); }
-// CHECK-LABEL: test_vdupq_n_u8
-// CHECK: vmov
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(i8 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK: ret <16 x i8> [[VECINIT15_I]]
uint8x16_t test_vdupq_n_u8(uint8_t a) { return vdupq_n_u8(a); }
-// CHECK-LABEL: test_vdupq_n_u16
-// CHECK: vmov
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(i16 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK: ret <8 x i16> [[VECINIT7_I]]
uint16x8_t test_vdupq_n_u16(uint16_t a) { return vdupq_n_u16(a); }
-// CHECK-LABEL: test_vdupq_n_u32
-// CHECK: vmov
+// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(i32 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK: ret <4 x i32>
[[VECINIT3_I]] uint32x4_t test_vdupq_n_u32(uint32_t a) { return vdupq_n_u32(a); } -// CHECK-LABEL: test_vdupq_n_s8 -// CHECK: vmov +// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 +// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 +// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 +// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 +// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 +// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 +// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 +// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VECINIT15_I]] int8x16_t test_vdupq_n_s8(int8_t a) { return vdupq_n_s8(a); } -// CHECK-LABEL: test_vdupq_n_s16 -// CHECK: vmov +// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VECINIT7_I]] int16x8_t test_vdupq_n_s16(int16_t a) { return vdupq_n_s16(a); } -// CHECK-LABEL: test_vdupq_n_s32 -// CHECK: vmov +// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VECINIT3_I]] int32x4_t test_vdupq_n_s32(int32_t a) { return vdupq_n_s32(a); } -// CHECK-LABEL: test_vdupq_n_p8 -// CHECK: vmov +// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK: ret <16 x i8> [[VECINIT15_I]]
poly8x16_t test_vdupq_n_p8(poly8_t a) { return vdupq_n_p8(a); }
-// CHECK-LABEL: test_vdupq_n_p16
-// CHECK: vmov
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(i16 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK: ret <8 x i16> [[VECINIT7_I]]
poly16x8_t test_vdupq_n_p16(poly16_t a) { return vdupq_n_p16(a); }
-// CHECK-LABEL: test_vdupq_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\], d[0-9]+\[\]}}}
+// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
+// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK: ret <8 x half> [[VECINIT7]]
float16x8_t test_vdupq_n_f16(float16_t *a) { return vdupq_n_f16(*a); }
-// CHECK-LABEL: test_vdupq_n_f32
-// CHECK: vmov
+// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
+// CHECK: ret <4
x float> [[VECINIT3_I]] float32x4_t test_vdupq_n_f32(float32_t a) { return vdupq_n_f32(a); } -// CHECK-LABEL: test_vdup_n_s64 -// CHECK: vmov +// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] +// CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vdup_n_s64(int64_t a) { int64x1_t tmp = vdup_n_s64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: test_vdup_n_u64 -// CHECK: vmov +// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] +// CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vdup_n_u64(uint64_t a) { int64x1_t tmp = vdup_n_u64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: test_vdupq_n_s64 -// CHECK: vmov +// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vdupq_n_s64(int64_t a) { int64x2_t tmp = vdupq_n_s64(a); return vaddq_s64(tmp, tmp); } -// CHECK-LABEL: test_vdupq_n_u64 -// CHECK: vmov +// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vdupq_n_u64(uint64_t a) { int64x2_t tmp = vdupq_n_u64(a); return vaddq_u64(tmp, tmp); } -// CHECK-LABEL: test_veor_s8 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[XOR_I]] int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) { return veor_s8(a, b); } -// CHECK-LABEL: test_veor_s16 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[XOR_I]] int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) { return veor_s16(a, b); } -// CHECK-LABEL: test_veor_s32 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[XOR_I]] int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) { return veor_s32(a, b); } -// CHECK-LABEL: test_veor_s64 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[XOR_I]] int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) { return veor_s64(a, b); } -// CHECK-LABEL: test_veor_u8 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[XOR_I]] uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) { return veor_u8(a, b); } -// CHECK-LABEL: test_veor_u16 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} 
+// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[XOR_I]] uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) { return veor_u16(a, b); } -// CHECK-LABEL: test_veor_u32 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[XOR_I]] uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) { return veor_u32(a, b); } -// CHECK-LABEL: test_veor_u64 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[XOR_I]] uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) { return veor_u64(a, b); } -// CHECK-LABEL: test_veorq_s8 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[XOR_I]] int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) { return veorq_s8(a, b); } -// CHECK-LABEL: test_veorq_s16 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[XOR_I]] int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) { return veorq_s16(a, b); } -// CHECK-LABEL: test_veorq_s32 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[XOR_I]] int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) { return veorq_s32(a, b); } -// CHECK-LABEL: test_veorq_s64 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[XOR_I]] int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) { return veorq_s64(a, b); } -// CHECK-LABEL: test_veorq_u8 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[XOR_I]] uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) { return veorq_u8(a, b); } -// CHECK-LABEL: test_veorq_u16 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[XOR_I]] uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) { return veorq_u16(a, b); } -// CHECK-LABEL: test_veorq_u32 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[XOR_I]] uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) { return veorq_u32(a, b); } -// CHECK-LABEL: test_veorq_u64 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[XOR_I]] uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) { return veorq_u64(a, b); 
} -// CHECK-LABEL: test_vext_s8 -// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[VEXT]] int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) { return vext_s8(a, b, 7); } -// CHECK-LABEL: test_vext_u8 -// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[VEXT]] uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) { return vext_u8(a, b, 7); } -// CHECK-LABEL: test_vext_p8 -// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[VEXT]] poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) { return vext_p8(a, b, 7); } -// CHECK-LABEL: test_vext_s16 -// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i16> [[VEXT]] int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) { return vext_s16(a, b, 3); } -// CHECK-LABEL: test_vext_u16 -// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i16> [[VEXT]] uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) { return vext_u16(a, b, 3); } -// CHECK-LABEL: test_vext_p16 -// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i16> [[VEXT]] poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) { return vext_p16(a, b, 3); } -// CHECK-LABEL: test_vext_s32 -// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i32> 
[[VEXT]] int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) { return vext_s32(a, b, 1); } -// CHECK-LABEL: test_vext_u32 -// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i32> [[VEXT]] uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) { return vext_u32(a, b, 1); } -// CHECK-LABEL: test_vext_s64 +// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[VEXT]] int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) { return vext_s64(a, b, 0); } -// CHECK-LABEL: test_vext_u64 +// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[VEXT]] uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) { return vext_u64(a, b, 0); } -// CHECK-LABEL: test_vext_f32 -// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> +// CHECK: ret <2 x float> [[VEXT]] float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) { return vext_f32(a, b, 1); } -// CHECK-LABEL: test_vextq_s8 -// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[VEXT]] int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) { return vextq_s8(a, b, 15); } -// CHECK-LABEL: test_vextq_u8 -// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[VEXT]] uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) { return vextq_u8(a, b, 15); } -// CHECK-LABEL: test_vextq_p8 -// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> 
%b, <16 x i32> +// CHECK: ret <16 x i8> [[VEXT]] poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) { return vextq_p8(a, b, 15); } -// CHECK-LABEL: test_vextq_s16 -// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK: ret <8 x i16> [[VEXT]] int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) { return vextq_s16(a, b, 7); } -// CHECK-LABEL: test_vextq_u16 -// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK: ret <8 x i16> [[VEXT]] uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) { return vextq_u16(a, b, 7); } -// CHECK-LABEL: test_vextq_p16 -// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK: ret <8 x i16> [[VEXT]] poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) { return vextq_p16(a, b, 7); } -// CHECK-LABEL: test_vextq_s32 -// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i32> [[VEXT]] int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) { return vextq_s32(a, b, 3); } -// CHECK-LABEL: test_vextq_u32 -// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i32> [[VEXT]] uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) { return vextq_u32(a, b, 3); } -// CHECK-LABEL: test_vextq_s64 -// CHECK: {{vmov|vdup}} +// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x 
i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i64> [[VEXT]] int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) { return vextq_s64(a, b, 1); } -// CHECK-LABEL: test_vextq_u64 -// CHECK: {{vmov|vdup}} +// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i64> [[VEXT]] uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) { return vextq_u64(a, b, 1); } -// CHECK-LABEL: test_vextq_f32 -// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +// CHECK: ret <4 x float> [[VEXT]] float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) { return vextq_f32(a, b, 3); } -// CHECK-LABEL: test_vfma_f32 -// CHECK: vfma.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4 +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vfma_f32(a, b, c); } -// CHECK-LABEL: test_vfmaq_f32 -// CHECK: vfma.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4 +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmaq_f32(a, b, c); } -// CHECK-LABEL: test_vget_high_s8 +// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> 
+// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vget_high_s8(int8x16_t a) { return vget_high_s8(a); } -// CHECK-LABEL: test_vget_high_s16 +// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vget_high_s16(int16x8_t a) { return vget_high_s16(a); } -// CHECK-LABEL: test_vget_high_s32 +// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_high_s32(int32x4_t a) { return vget_high_s32(a); } -// CHECK-LABEL: test_vget_high_s64 +// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> +// CHECK: ret <1 x i64> [[SHUFFLE_I]] int64x1_t test_vget_high_s64(int64x2_t a) { return vget_high_s64(a); } -// CHECK-LABEL: test_vget_high_f16 +// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> +// CHECK: ret <4 x half> [[SHUFFLE_I]] float16x4_t test_vget_high_f16(float16x8_t a) { return vget_high_f16(a); } -// CHECK-LABEL: test_vget_high_f32 +// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vget_high_f32(float32x4_t a) { return vget_high_f32(a); } -// CHECK-LABEL: test_vget_high_u8 +// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_high_u8(uint8x16_t a) { return vget_high_u8(a); } -// CHECK-LABEL: test_vget_high_u16 +// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vget_high_u16(uint16x8_t a) { return vget_high_u16(a); } -// CHECK-LABEL: test_vget_high_u32 +// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vget_high_u32(uint32x4_t a) { return vget_high_u32(a); } -// CHECK-LABEL: test_vget_high_u64 +// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> +// CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_high_u64(uint64x2_t a) { return vget_high_u64(a); } -// CHECK-LABEL: test_vget_high_p8 +// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vget_high_p8(poly8x16_t a) { return vget_high_p8(a); } -// CHECK-LABEL: test_vget_high_p16 +// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vget_high_p16(poly16x8_t a) { return vget_high_p16(a); } -// CHECK-LABEL: test_vget_lane_u8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vget_lane_u8(<8 x i8> 
%a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] uint8_t test_vget_lane_u8(uint8x8_t a) { return vget_lane_u8(a, 7); } -// CHECK-LABEL: test_vget_lane_u16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vget_lane_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] uint16_t test_vget_lane_u16(uint16x4_t a) { return vget_lane_u16(a, 3); } -// CHECK-LABEL: test_vget_lane_u32 -// CHECK: mov +// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] uint32_t test_vget_lane_u32(uint32x2_t a) { return vget_lane_u32(a, 1); } -// CHECK-LABEL: test_vget_lane_s8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vget_lane_s8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] int8_t test_vget_lane_s8(int8x8_t a) { return vget_lane_s8(a, 7); } -// CHECK-LABEL: test_vget_lane_s16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vget_lane_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] int16_t test_vget_lane_s16(int16x4_t a) { return vget_lane_s16(a, 3); } -// CHECK-LABEL: test_vget_lane_s32 -// CHECK: mov +// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] int32_t test_vget_lane_s32(int32x2_t a) { return vget_lane_s32(a, 1); } -// CHECK-LABEL: test_vget_lane_p8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vget_lane_p8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] poly8_t test_vget_lane_p8(poly8x8_t a) { return vget_lane_p8(a, 7); } -// CHECK-LABEL: test_vget_lane_p16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vget_lane_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] poly16_t test_vget_lane_p16(poly16x4_t a) { return vget_lane_p16(a, 3); } -// CHECK-LABEL: test_vget_lane_f32 -// CHECK: vmov +// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: ret float [[VGET_LANE]] float32_t test_vget_lane_f32(float32x2_t a) { return vget_lane_f32(a, 1); } -// CHECK-LABEL: test_vget_lane_f16 -// CHECK: vmov +// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 { +// CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8 +// CHECK: [[__REINT1_242:%.*]] = alloca i16, align 2 +// CHECK: store <4 x half> %a, <4 x 
half>* [[__REINT_242]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>* +// CHECK: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 +// CHECK: store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2 +// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half* +// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2 +// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float +// CHECK: ret float [[CONV]] float32_t test_vget_lane_f16(float16x4_t a) { return vget_lane_f16(a, 1); } -// CHECK-LABEL: test_vgetq_lane_u8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] uint8_t test_vgetq_lane_u8(uint8x16_t a) { return vgetq_lane_u8(a, 15); } -// CHECK-LABEL: test_vgetq_lane_u16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] uint16_t test_vgetq_lane_u16(uint16x8_t a) { return vgetq_lane_u16(a, 7); } -// CHECK-LABEL: test_vgetq_lane_u32 -// CHECK: vmov +// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] uint32_t test_vgetq_lane_u32(uint32x4_t a) { return vgetq_lane_u32(a, 3); } -// CHECK-LABEL: test_vgetq_lane_s8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] int8_t test_vgetq_lane_s8(int8x16_t a) { return vgetq_lane_s8(a, 15); } -// CHECK-LABEL: test_vgetq_lane_s16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] int16_t test_vgetq_lane_s16(int16x8_t a) { return vgetq_lane_s16(a, 7); } -// CHECK-LABEL: test_vgetq_lane_s32 -// CHECK: vmov +// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] int32_t test_vgetq_lane_s32(int32x4_t a) { return vgetq_lane_s32(a, 3); } -// CHECK-LABEL: test_vgetq_lane_p8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] poly8_t test_vgetq_lane_p8(poly8x16_t a) { return vgetq_lane_p8(a, 15); } -// CHECK-LABEL: test_vgetq_lane_p16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> 
[[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] poly16_t test_vgetq_lane_p16(poly16x8_t a) { return vgetq_lane_p16(a, 7); } -// CHECK-LABEL: test_vgetq_lane_f32 -// CHECK: vmov +// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: ret float [[VGETQ_LANE]] float32_t test_vgetq_lane_f32(float32x4_t a) { return vgetq_lane_f32(a, 3); } -// CHECK-LABEL: test_vgetq_lane_f16 -// CHECK: vmov +// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 { +// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16 +// CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2 +// CHECK: store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>* +// CHECK: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +// CHECK: store i16 [[VGETQ_LANE]], i16* [[__REINT1_244]], align 2 +// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half* +// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2 +// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float +// CHECK: ret float [[CONV]] float32_t test_vgetq_lane_f16(float16x8_t a) { return vgetq_lane_f16(a, 3); } -// CHECK-LABEL: test_vget_lane_s64 // The optimizer is able to remove all moves now. +// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] int64_t test_vget_lane_s64(int64x1_t a) { return vget_lane_s64(a, 0); } -// CHECK-LABEL: test_vget_lane_u64 // The optimizer is able to remove all moves now. 
+// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] uint64_t test_vget_lane_u64(uint64x1_t a) { return vget_lane_u64(a, 0); } -// CHECK-LABEL: test_vgetq_lane_s64 -// CHECK: vmov +// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] int64_t test_vgetq_lane_s64(int64x2_t a) { return vgetq_lane_s64(a, 1); } -// CHECK-LABEL: test_vgetq_lane_u64 -// CHECK: vmov +// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] uint64_t test_vgetq_lane_u64(uint64x2_t a) { return vgetq_lane_u64(a, 1); } -// CHECK-LABEL: test_vget_low_s8 +// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vget_low_s8(int8x16_t a) { return vget_low_s8(a); } -// CHECK-LABEL: test_vget_low_s16 +// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vget_low_s16(int16x8_t a) { return vget_low_s16(a); } -// CHECK-LABEL: test_vget_low_s32 +// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_low_s32(int32x4_t a) { return vget_low_s32(a); } -// CHECK-LABEL: test_vget_low_s64 +// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE_I]] int64x1_t test_vget_low_s64(int64x2_t a) { return vget_low_s64(a); } -// CHECK-LABEL: test_vget_low_f16 +// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> +// CHECK: ret <4 x half> [[SHUFFLE_I]] float16x4_t test_vget_low_f16(float16x8_t a) { return vget_low_f16(a); } -// CHECK-LABEL: test_vget_low_f32 +// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vget_low_f32(float32x4_t a) { return vget_low_f32(a); } -// CHECK-LABEL: test_vget_low_u8 +// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_low_u8(uint8x16_t a) { return vget_low_u8(a); } -// CHECK-LABEL: test_vget_low_u16 +// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x 
i16> [[SHUFFLE_I]] uint16x4_t test_vget_low_u16(uint16x8_t a) { return vget_low_u16(a); } -// CHECK-LABEL: test_vget_low_u32 +// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vget_low_u32(uint32x4_t a) { return vget_low_u32(a); } -// CHECK-LABEL: test_vget_low_u64 +// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_low_u64(uint64x2_t a) { return vget_low_u64(a); } -// CHECK-LABEL: test_vget_low_p8 +// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vget_low_p8(poly8x16_t a) { return vget_low_p8(a); } -// CHECK-LABEL: test_vget_low_p16 +// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vget_low_p16(poly16x8_t a) { return vget_low_p16(a); } -// CHECK-LABEL: test_vhadd_s8 -// CHECK: vhadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VHADD_V_I]] int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) { return vhadd_s8(a, b); } -// CHECK-LABEL: test_vhadd_s16 -// CHECK: vhadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) { return vhadd_s16(a, b); } -// CHECK-LABEL: test_vhadd_s32 -// CHECK: vhadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) { return vhadd_s32(a, b); } -// CHECK-LABEL: test_vhadd_u8 -// CHECK: vhadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VHADD_V_I]] uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) { return vhadd_u8(a, b); } -// CHECK-LABEL: test_vhadd_u16 -// CHECK: vhadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) { return vhadd_u16(a, b); } -// CHECK-LABEL: test_vhadd_u32 -// CHECK: vhadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) { return vhadd_u32(a, b); } -// CHECK-LABEL: test_vhaddq_s8 -// CHECK: vhadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VHADDQ_V_I]] int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) { return vhaddq_s8(a, b); } -// CHECK-LABEL: test_vhaddq_s16 -// CHECK: vhadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) { return vhaddq_s16(a, b); } -// CHECK-LABEL: test_vhaddq_s32 -// CHECK: vhadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHADDQ_V2_I:%.*]] = 
call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) { return vhaddq_s32(a, b); } -// CHECK-LABEL: test_vhaddq_u8 -// CHECK: vhadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VHADDQ_V_I]] uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) { return vhaddq_u8(a, b); } -// CHECK-LABEL: test_vhaddq_u16 -// CHECK: vhadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) { return vhaddq_u16(a, b); } -// CHECK-LABEL: test_vhaddq_u32 -// CHECK: vhadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) { return vhaddq_u32(a, b); } -// CHECK-LABEL: test_vhsub_s8 -// CHECK: vhsub.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VHSUB_V_I]] int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) { return vhsub_s8(a, b); } -// CHECK-LABEL: test_vhsub_s16 -// CHECK: vhsub.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to 
<4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) { return vhsub_s16(a, b); } -// CHECK-LABEL: test_vhsub_s32 -// CHECK: vhsub.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) { return vhsub_s32(a, b); } -// CHECK-LABEL: test_vhsub_u8 -// CHECK: vhsub.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VHSUB_V_I]] uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) { return vhsub_u8(a, b); } -// CHECK-LABEL: test_vhsub_u16 -// CHECK: vhsub.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) { return vhsub_u16(a, b); } -// CHECK-LABEL: test_vhsub_u32 -// CHECK: vhsub.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) { return vhsub_u32(a, b); } -// CHECK-LABEL: test_vhsubq_s8 -// CHECK: vhsub.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VHSUBQ_V_I]] int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) { return vhsubq_s8(a, b); } -// CHECK-LABEL: test_vhsubq_s16 -// CHECK: vhsub.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 
x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) { return vhsubq_s16(a, b); } -// CHECK-LABEL: test_vhsubq_s32 -// CHECK: vhsub.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) { return vhsubq_s32(a, b); } -// CHECK-LABEL: test_vhsubq_u8 -// CHECK: vhsub.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VHSUBQ_V_I]] uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) { return vhsubq_u8(a, b); } -// CHECK-LABEL: test_vhsubq_u16 -// CHECK: vhsub.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) { return vhsubq_u16(a, b); } -// CHECK-LABEL: test_vhsubq_u32 -// CHECK: vhsub.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t 
test_vhsubq_u32(uint32x4_t a, uint32x4_t b) { return vhsubq_u32(a, b); } -// CHECK-LABEL: test_vld1q_u8 -// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]] +// CHECK: ret <16 x i8> [[TMP1]] uint8x16_t test_vld1q_u8(uint8_t const * a) { return vld1q_u8(a); } -// CHECK-LABEL: test_vld1q_u16 -// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vld1q_u16(uint16_t const * a) { return vld1q_u16(a); } -// CHECK-LABEL: test_vld1q_u32 -// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]] +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vld1q_u32(uint32_t const * a) { return vld1q_u32(a); } -// CHECK-LABEL: test_vld1q_u64 -// CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]] +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vld1q_u64(uint64_t const * a) { return vld1q_u64(a); } -// CHECK-LABEL: test_vld1q_s8 -// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]] +// CHECK: ret <16 x i8> [[TMP1]] int8x16_t test_vld1q_s8(int8_t const * a) { return vld1q_s8(a); } -// CHECK-LABEL: test_vld1q_s16 -// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vld1q_s16(int16_t const * a) { return vld1q_s16(a); } -// CHECK-LABEL: test_vld1q_s32 -// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]] +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vld1q_s32(int32_t const * a) { return vld1q_s32(a); } -// CHECK-LABEL: test_vld1q_s64 -// CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]] +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vld1q_s64(int64_t const * a) { return vld1q_s64(a); } -// CHECK-LABEL: test_vld1q_f16 -// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x 
half> @test_vld1q_f16(half* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] +// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half> +// CHECK: ret <8 x half> [[TMP3]] float16x8_t test_vld1q_f16(float16_t const * a) { return vld1q_f16(a); } -// CHECK-LABEL: test_vld1q_f32 -// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]] +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vld1q_f32(float32_t const * a) { return vld1q_f32(a); } -// CHECK-LABEL: test_vld1q_p8 -// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]] +// CHECK: ret <16 x i8> [[TMP1]] poly8x16_t test_vld1q_p8(poly8_t const * a) { return vld1q_p8(a); } -// CHECK-LABEL: test_vld1q_p16 -// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] +// CHECK: ret <8 x i16> [[TMP2]] poly16x8_t test_vld1q_p16(poly16_t const * a) { return vld1q_p16(a); } -// CHECK-LABEL: test_vld1_u8 -// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]] +// CHECK: ret <8 x i8> [[TMP1]] uint8x8_t test_vld1_u8(uint8_t const * a) { return vld1_u8(a); } -// CHECK-LABEL: test_vld1_u16 -// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vld1_u16(uint16_t const * a) { return vld1_u16(a); } -// CHECK-LABEL: test_vld1_u32 -// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]] +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vld1_u32(uint32_t const * a) { return vld1_u32(a); } -// CHECK-LABEL: test_vld1_u64 -// CHECK: vld1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]] +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vld1_u64(uint64_t const * a) { return vld1_u64(a); } -// CHECK-LABEL: test_vld1_s8 -// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]] +// CHECK: ret <8 x i8> [[TMP1]] int8x8_t 
test_vld1_s8(int8_t const * a) { return vld1_s8(a); } -// CHECK-LABEL: test_vld1_s16 -// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vld1_s16(int16_t const * a) { return vld1_s16(a); } -// CHECK-LABEL: test_vld1_s32 -// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]] +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vld1_s32(int32_t const * a) { return vld1_s32(a); } -// CHECK-LABEL: test_vld1_s64 -// CHECK: vld1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]] +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vld1_s64(int64_t const * a) { return vld1_s64(a); } -// CHECK-LABEL: test_vld1_f16 -// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK: ret <4 x half> [[TMP3]] float16x4_t test_vld1_f16(float16_t const * a) { return vld1_f16(a); } -// CHECK-LABEL: test_vld1_f32 -// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]] +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vld1_f32(float32_t const * a) { return vld1_f32(a); } -// CHECK-LABEL: test_vld1_p8 -// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]] +// CHECK: ret <8 x i8> [[TMP1]] poly8x8_t test_vld1_p8(poly8_t const * a) { return vld1_p8(a); } -// CHECK-LABEL: test_vld1_p16 -// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] +// CHECK: ret <4 x i16> [[TMP2]] poly16x4_t test_vld1_p16(poly16_t const * a) { return vld1_p16(a); } -// CHECK-LABEL: test_vld1q_dup_u8 -// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK: ret <16 x i8> [[LANE]] uint8x16_t test_vld1q_dup_u8(uint8_t const * a) { return vld1q_dup_u8(a); } -// CHECK-LABEL: test_vld1q_dup_u16 -// CHECK: vld1.16 
{d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer +// CHECK: ret <8 x i16> [[LANE]] uint16x8_t test_vld1q_dup_u16(uint16_t const * a) { return vld1q_dup_u16(a); } -// CHECK-LABEL: test_vld1q_dup_u32 -// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x i32> [[LANE]] uint32x4_t test_vld1q_dup_u32(uint32_t const * a) { return vld1q_dup_u32(a); } -// CHECK-LABEL: test_vld1q_dup_u64 -// CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] uint64x2_t test_vld1q_dup_u64(uint64_t const * a) { return vld1q_dup_u64(a); } -// CHECK-LABEL: test_vld1q_dup_s8 -// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK: ret <16 x i8> [[LANE]] int8x16_t test_vld1q_dup_s8(int8_t const * a) { return vld1q_dup_s8(a); } -// CHECK-LABEL: test_vld1q_dup_s16 -// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer +// CHECK: ret <8 x i16> [[LANE]] int16x8_t test_vld1q_dup_s16(int16_t const * a) { return vld1q_dup_s16(a); } -// CHECK-LABEL: test_vld1q_dup_s32 -// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x i32> [[LANE]] int32x4_t test_vld1q_dup_s32(int32_t const * a) { return vld1q_dup_s32(a); } -// CHECK-LABEL: test_vld1q_dup_s64 -// 
CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] int64x2_t test_vld1q_dup_s64(int64_t const * a) { return vld1q_dup_s64(a); } -// CHECK-LABEL: test_vld1q_dup_f16 -// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half> +// CHECK: ret <8 x half> [[TMP4]] float16x8_t test_vld1q_dup_f16(float16_t const * a) { return vld1q_dup_f16(a); } -// CHECK-LABEL: test_vld1q_dup_f32 -// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x float> [[LANE]] float32x4_t test_vld1q_dup_f32(float32_t const * a) { return vld1q_dup_f32(a); } -// CHECK-LABEL: test_vld1q_dup_p8 -// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK: ret <16 x i8> [[LANE]] poly8x16_t test_vld1q_dup_p8(poly8_t const * a) { return vld1q_dup_p8(a); } -// CHECK-LABEL: test_vld1q_dup_p16 -// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer +// CHECK: ret <8 x i16> [[LANE]] poly16x8_t test_vld1q_dup_p16(poly16_t const * a) { return vld1q_dup_p16(a); } -// CHECK-LABEL: test_vld1_dup_u8 -// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer +// CHECK: ret <8 x i8> [[LANE]] uint8x8_t test_vld1_dup_u8(uint8_t const * a) { return vld1_dup_u8(a); } -// CHECK-LABEL: test_vld1_dup_u16 -// CHECK: vld1.16 {d{{[0-9]+}}[]}, 
[r{{[0-9]+}}:16] +// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x i16> [[LANE]] uint16x4_t test_vld1_dup_u16(uint16_t const * a) { return vld1_dup_u16(a); } -// CHECK-LABEL: test_vld1_dup_u32 -// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i32> [[LANE]] uint32x2_t test_vld1_dup_u32(uint32_t const * a) { return vld1_dup_u32(a); } -// CHECK-LABEL: test_vld1_dup_u64 -// CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] uint64x1_t test_vld1_dup_u64(uint64_t const * a) { return vld1_dup_u64(a); } -// CHECK-LABEL: test_vld1_dup_s8 -// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer +// CHECK: ret <8 x i8> [[LANE]] int8x8_t test_vld1_dup_s8(int8_t const * a) { return vld1_dup_s8(a); } -// CHECK-LABEL: test_vld1_dup_s16 -// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x i16> [[LANE]] int16x4_t test_vld1_dup_s16(int16_t const * a) { return vld1_dup_s16(a); } -// CHECK-LABEL: test_vld1_dup_s32 -// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i32> [[LANE]] int32x2_t test_vld1_dup_s32(int32_t const * a) { return vld1_dup_s32(a); } -// CHECK-LABEL: test_vld1_dup_s64 -// CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast 
i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x i64> [[LANE]]
int64x1_t test_vld1_dup_s64(int64_t const * a) {
  return vld1_dup_s64(a);
}
-// CHECK-LABEL: test_vld1_dup_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP4]]
float16x4_t test_vld1_dup_f16(float16_t const * a) {
  return vld1_dup_f16(a);
}
-// CHECK-LABEL: test_vld1_dup_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x float> [[LANE]]
float32x2_t test_vld1_dup_f32(float32_t const * a) {
  return vld1_dup_f32(a);
}
-// CHECK-LABEL: test_vld1_dup_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i8> [[LANE]]
poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
  return vld1_dup_p8(a);
}
-// CHECK-LABEL: test_vld1_dup_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i16> [[LANE]]
poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
  return vld1_dup_p16(a);
}
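+// NOTE: The vld1*_lane tests that follow are checked as a scalar load plus an
+// insertelement at the requested lane index (with bitcasts through the byte
+// vector types for non-i8 elements). A usage sketch, for illustration only
+// (`patch_lane` is a hypothetical helper, not part of this test):
+//   uint8x16_t patch_lane(const uint8_t *p, uint8x16_t v) {
+//     return vld1q_lane_u8(p, v, 15);  /* replace lane 15 with *p */
+//   }
-// CHECK-LABEL: test_vld1q_lane_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK: ret <16 x i8> [[VLD1_LANE]]
uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
  return vld1q_lane_u8(a, b, 15);
}
-// CHECK-LABEL: test_vld1q_lane_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast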
<16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) { return vld1q_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vld1q_lane_u32 -// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3 +// CHECK: ret <4 x i32> [[VLD1_LANE]] uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) { return vld1q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vld1q_lane_u64 -// CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 +// CHECK: ret <2 x i64> [[VLD1_LANE]] uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) { return vld1q_lane_u64(a, b, 1); } -// CHECK-LABEL: test_vld1q_lane_s8 -// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 +// CHECK: ret <16 x i8> [[VLD1_LANE]] int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) { return vld1q_lane_s8(a, b, 15); } -// CHECK-LABEL: test_vld1q_lane_s16 -// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) { return vld1q_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vld1q_lane_s32 -// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3 +// CHECK: ret <4 x i32> [[VLD1_LANE]] int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) { return vld1q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vld1q_lane_s64 -// CHECK: 
{{ldr|vldr|vmov}} +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 +// CHECK: ret <2 x i64> [[VLD1_LANE]] int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) { return vld1q_lane_s64(a, b, 1); } -// CHECK-LABEL: test_vld1q_lane_f16 -// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half> +// CHECK: ret <8 x half> [[TMP5]] float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) { return vld1q_lane_f16(a, b, 7); } -// CHECK-LABEL: test_vld1q_lane_f32 -// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3 +// CHECK: ret <4 x float> [[VLD1_LANE]] float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) { return vld1q_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vld1q_lane_p8 -// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 +// CHECK: ret <16 x i8> [[VLD1_LANE]] poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) { return vld1q_lane_p8(a, b, 15); } -// CHECK-LABEL: test_vld1q_lane_p16 -// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) { return vld1q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vld1_lane_u8 -// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 +// CHECK: ret <8 x i8> [[VLD1_LANE]] uint8x8_t 
test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) { return vld1_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vld1_lane_u16 -// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: ret <4 x i16> [[VLD1_LANE]] uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) { return vld1_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vld1_lane_u32 -// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1 +// CHECK: ret <2 x i32> [[VLD1_LANE]] uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) { return vld1_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vld1_lane_u64 -// CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0 +// CHECK: ret <1 x i64> [[VLD1_LANE]] uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) { return vld1_lane_u64(a, b, 0); } -// CHECK-LABEL: test_vld1_lane_s8 -// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 +// CHECK: ret <8 x i8> [[VLD1_LANE]] int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) { return vld1_lane_s8(a, b, 7); } -// CHECK-LABEL: test_vld1_lane_s16 -// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: ret <4 x i16> [[VLD1_LANE]] int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) { return vld1_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vld1_lane_s32 -// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// 
CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK: ret <2 x i32> [[VLD1_LANE]]
int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
  return vld1_lane_s32(a, b, 1);
}
-// CHECK-LABEL: test_vld1_lane_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK: ret <1 x i64> [[VLD1_LANE]]
int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
  return vld1_lane_s64(a, b, 0);
}
-// CHECK-LABEL: test_vld1_lane_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP5]]
float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
  return vld1_lane_f16(a, b, 3);
}
-// CHECK-LABEL: test_vld1_lane_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]]
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
+// CHECK: ret <2 x float> [[VLD1_LANE]]
float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
  return vld1_lane_f32(a, b, 1);
}
-// CHECK-LABEL: test_vld1_lane_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK: ret <8 x i8> [[VLD1_LANE]]
poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
  return vld1_lane_p8(a, b, 7);
}
-// CHECK-LABEL: test_vld1_lane_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK: ret <4 x i16> [[VLD1_LANE]]
poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
  return vld1_lane_p16(a, b, 3);
}
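+// NOTE: The vld2* tests that follow no longer match a vld2 instruction; with
+// -emit-llvm they check a call to the two-result @llvm.aarch64.neon.ld2.*
+// intrinsic, the store of the { vector, vector } aggregate into a temporary,
+// and the memcpy into the returned struct. A usage sketch, for illustration
+// only (`sum_pairs` is a hypothetical helper, not part of this test):
+//   uint8x8_t sum_pairs(const uint8_t *p) {
+//     uint8x8x2_t v = vld2_u8(p);          /* de-interleaving load */
+//     return vadd_u8(v.val[0], v.val[1]);  /* even lanes + odd lanes */
+//   }
-// CHECK-LABEL: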
test_vld2q_u8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP5]] uint8x16x2_t test_vld2q_u8(uint8_t const * a) { return vld2q_u8(a); } -// CHECK-LABEL: test_vld2q_u16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP6]] uint16x8x2_t test_vld2q_u16(uint16_t const * a) { return vld2q_u16(a); } -// CHECK-LABEL: test_vld2q_u32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>* +// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// 
CHECK: ret %struct.uint32x4x2_t [[TMP6]] uint32x4x2_t test_vld2q_u32(uint32_t const * a) { return vld2q_u32(a); } -// CHECK-LABEL: test_vld2q_s8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP5]] int8x16x2_t test_vld2q_s8(int8_t const * a) { return vld2q_s8(a); } -// CHECK-LABEL: test_vld2q_s16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP6]] int16x8x2_t test_vld2q_s16(int16_t const * a) { return vld2q_s16(a); } -// CHECK-LABEL: test_vld2q_s32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>* +// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) 
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP6]] int32x4x2_t test_vld2q_s32(int32_t const * a) { return vld2q_s32(a); } -// CHECK-LABEL: test_vld2q_f16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x2_t [[TMP6]] float16x8x2_t test_vld2q_f16(float16_t const * a) { return vld2q_f16(a); } -// CHECK-LABEL: test_vld2q_f32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>* +// CHECK: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP6]] float32x4x2_t test_vld2q_f32(float32_t const * a) { return vld2q_f32(a); } -// CHECK-LABEL: test_vld2q_p8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8* 
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP5]] poly8x16x2_t test_vld2q_p8(poly8_t const * a) { return vld2q_p8(a); } -// CHECK-LABEL: test_vld2q_p16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP6]] poly16x8x2_t test_vld2q_p16(poly16_t const * a) { return vld2q_p16(a); } -// CHECK-LABEL: test_vld2_u8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP5]] uint8x8x2_t test_vld2_u8(uint8_t const * a) { return vld2_u8(a); } -// CHECK-LABEL: test_vld2_u16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast 
%struct.uint16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP6]] uint16x4x2_t test_vld2_u16(uint16_t const * a) { return vld2_u16(a); } -// CHECK-LABEL: test_vld2_u32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP6]] uint32x2x2_t test_vld2_u32(uint32_t const * a) { return vld2_u32(a); } -// CHECK-LABEL: test_vld2_u64 -// CHECK: vld1.64 +// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x2_t [[TMP6]] uint64x1x2_t test_vld2_u64(uint64_t const * a) { return vld2_u64(a); } -// CHECK-LABEL: test_vld2_s8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to 
i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP5]] int8x8x2_t test_vld2_s8(int8_t const * a) { return vld2_s8(a); } -// CHECK-LABEL: test_vld2_s16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP6]] int16x4x2_t test_vld2_s16(int16_t const * a) { return vld2_s16(a); } -// CHECK-LABEL: test_vld2_s32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP6]] int32x2x2_t test_vld2_s32(int32_t const * a) { return vld2_s32(a); } -// CHECK-LABEL: test_vld2_s64 -// CHECK: vld1.64 +// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* 
[[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x2_t [[TMP6]] int64x1x2_t test_vld2_s64(int64_t const * a) { return vld2_s64(a); } -// CHECK-LABEL: test_vld2_f16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x2_t [[TMP6]] float16x4x2_t test_vld2_f16(float16_t const * a) { return vld2_f16(a); } -// CHECK-LABEL: test_vld2_f32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>* +// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0v2f32(<2 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP6]] float32x2x2_t test_vld2_f32(float32_t const * a) { return vld2_f32(a); } -// CHECK-LABEL: test_vld2_p8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]] 
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP5]]
poly8x8x2_t test_vld2_p8(poly8_t const * a) {
  return vld2_p8(a);
}
-// CHECK-LABEL: test_vld2_p16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld2_p16(poly16_t const * a) {
  return vld2_p16(a);
}
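+// NOTE: The vld2_dup_* tests that follow check the replicating form,
+// @llvm.aarch64.neon.ld2r.*, which takes a pointer to the scalar element
+// rather than to a whole vector, so only pointer bitcasts precede the call.
+// A usage sketch, for illustration only (`broadcast_pair` is a hypothetical
+// helper, not part of this test):
+//   uint16x4x2_t broadcast_pair(const uint16_t *p) {
+//     return vld2_dup_u16(p);  /* val[0] = splat of p[0], val[1] = splat of p[1] */
+//   }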
-// CHECK-LABEL: test_vld2_dup_u8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_dup_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP4]]
uint8x8x2_t test_vld2_dup_u8(uint8_t const * a) {
  return vld2_dup_u8(a);
}
-// CHECK-LABEL: test_vld2_dup_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_dup_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
uint16x4x2_t test_vld2_dup_u16(uint16_t const * a) {
  return vld2_dup_u16(a);
}
-// CHECK-LABEL: test_vld2_dup_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_dup_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
uint32x2x2_t test_vld2_dup_u32(uint32_t const * a) {
  return vld2_dup_u32(a);
}
-// CHECK-LABEL: test_vld2_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_dup_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
uint64x1x2_t test_vld2_dup_u64(uint64_t const * a) {
  return vld2_dup_u64(a);
}
-// CHECK-LABEL: test_vld2_dup_s8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] =
bitcast %struct.int8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP4]] int8x8x2_t test_vld2_dup_s8(int8_t const * a) { return vld2_dup_s8(a); } -// CHECK-LABEL: test_vld2_dup_s16 -// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP6]] int16x4x2_t test_vld2_dup_s16(int16_t const * a) { return vld2_dup_s16(a); } -// CHECK-LABEL: test_vld2_dup_s32 -// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP6]] int32x2x2_t test_vld2_dup_s32(int32_t const * a) { return vld2_dup_s32(a); } -// CHECK-LABEL: test_vld2_dup_s64 -// CHECK: vld1.64 +// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8* +// 
CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x2_t [[TMP6]] int64x1x2_t test_vld2_dup_s64(int64_t const * a) { return vld2_dup_s64(a); } -// CHECK-LABEL: test_vld2_dup_f16 -// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_dup_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x2_t [[TMP6]] float16x4x2_t test_vld2_dup_f16(float16_t const * a) { return vld2_dup_f16(a); } -// CHECK-LABEL: test_vld2_dup_f32 -// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP6]] float32x2x2_t test_vld2_dup_f32(float32_t const * a) { return vld2_dup_f32(a); } -// CHECK-LABEL: test_vld2_dup_p8 -// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* 
[[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP4]]
poly8x8x2_t test_vld2_dup_p8(poly8_t const * a) {
  return vld2_dup_p8(a);
}
-// CHECK-LABEL: test_vld2_dup_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_dup_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
  return vld2_dup_p16(a);
}
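+// NOTE: For the vld2q_lane_* tests that follow, the two-vector operand is
+// passed as a [2 x <8 x i16>] array coerced through %b.coerce, so the checks
+// also cover the alloca/memcpy argument traffic before the call to
+// @llvm.aarch64.neon.ld2lane.*. A usage sketch, for illustration only
+// (`reload_lane` is a hypothetical helper, not part of this test):
+//   uint16x8x2_t reload_lane(const uint16_t *p, uint16x8x2_t v) {
+//     return vld2q_lane_u16(p, v, 7);  /* reloads lane 7 of both vectors */
+//   }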
+// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP1]]3 uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) { return vld2q_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vld2q_lane_u32 -// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP1]]3 uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) 
{ return vld2q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vld2q_lane_s16 -// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP1]]3 int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) { return vld2q_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vld2q_lane_s32 -// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: 
[[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP1]]3 int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) { return vld2q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vld2q_lane_f16 -// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x2_t [[TMP1]]3 float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) { return vld2q_lane_f16(a, b, 7); } -// CHECK-LABEL: test_vld2q_lane_f32 -// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP1]]0 +// 
CHECK: [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP1]]3 float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) { return vld2q_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vld2q_lane_p16 -// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP1]]3 poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) { return vld2q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vld2_lane_u8 -// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// 
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
  return vld2_lane_u8(a, b, 7);
}
-// CHECK-LABEL: test_vld2_lane_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP13]]
uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
  return vld2_lane_u16(a, b, 3);
}
-// CHECK-LABEL: test_vld2_lane_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <2 x i32>] %b.coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP13]]
uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
  return vld2_lane_u32(a, b, 1);
}
-// CHECK-LABEL: test_vld2_lane_s8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP8]]
int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
  return vld2_lane_s8(a, b, 7);
}
-// CHECK-LABEL: test_vld2_lane_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP13]]
int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
  return vld2_lane_s16(a, b, 3);
}
-// CHECK-LABEL: test_vld2_lane_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <2 x i32>] %b.coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP13]]
int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
  return vld2_lane_s32(a, b, 1);
}
-// CHECK-LABEL: test_vld2_lane_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x half>] %b.coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x2_t [[TMP13]]
float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
  return vld2_lane_f16(a, b, 3);
}
-// CHECK-LABEL: test_vld2_lane_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <2 x float>] %b.coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP13]]
float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
  return vld2_lane_f32(a, b, 1);
}
-// CHECK-LABEL: test_vld2_lane_p8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
  return vld2_lane_p8(a, b, 7);
}
-// CHECK-LABEL: test_vld2_lane_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP13]]
poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
  return vld2_lane_p16(a, b, 3);
}
-// CHECK-LABEL: test_vld3q_u8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x3_t [[TMP5]]
uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
  return vld3q_u8(a);
}
-// CHECK-LABEL: test_vld3q_u16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
  return vld3q_u16(a);
}
-// CHECK-LABEL: test_vld3q_u32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
  return vld3q_u32(a);
}
-// CHECK-LABEL: test_vld3q_s8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x3_t [[TMP5]]
int8x16x3_t test_vld3q_s8(int8_t const * a) {
  return vld3q_s8(a);
}
-// CHECK-LABEL: test_vld3q_s16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x3_t [[TMP6]]
int16x8x3_t test_vld3q_s16(int16_t const * a) {
  return vld3q_s16(a);
}
-// CHECK-LABEL: test_vld3q_s32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x3_t [[TMP6]]
int32x4x3_t test_vld3q_s32(int32_t const * a) {
  return vld3q_s32(a);
}
-// CHECK-LABEL: test_vld3q_f16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float16x8x3_t [[TMP6]]
float16x8x3_t test_vld3q_f16(float16_t const * a) {
  return vld3q_f16(a);
}
-// CHECK-LABEL: test_vld3q_f32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x3_t [[TMP6]]
float32x4x3_t test_vld3q_f32(float32_t const * a) {
  return vld3q_f32(a);
}
-// CHECK-LABEL: test_vld3q_p8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x3_t [[TMP5]]
poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
  return vld3q_p8(a);
}
-// CHECK-LABEL: test_vld3q_p16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x3_t [[TMP6]]
poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
  return vld3q_p16(a);
}
-// CHECK-LABEL: test_vld3_u8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x3_t [[TMP5]]
uint8x8x3_t test_vld3_u8(uint8_t const * a) {
  return vld3_u8(a);
}
-// CHECK-LABEL: test_vld3_u16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x3_t [[TMP6]]
uint16x4x3_t test_vld3_u16(uint16_t const * a) {
  return vld3_u16(a);
}
-// CHECK-LABEL: test_vld3_u32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x3_t [[TMP6]]
uint32x2x3_t test_vld3_u32(uint32_t const * a) {
  return vld3_u32(a);
}
-// CHECK-LABEL: test_vld3_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x3_t [[TMP6]]
uint64x1x3_t test_vld3_u64(uint64_t const * a) {
  return vld3_u64(a);
}
-// CHECK-LABEL: test_vld3_s8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x3_t [[TMP5]]
int8x8x3_t test_vld3_s8(int8_t const * a) {
  return vld3_s8(a);
}
-// CHECK-LABEL: test_vld3_s16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x3_t [[TMP6]]
int16x4x3_t test_vld3_s16(int16_t const * a) {
  return vld3_s16(a);
}
-// CHECK-LABEL: test_vld3_s32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x3_t [[TMP6]]
int32x2x3_t test_vld3_s32(int32_t const * a) {
  return vld3_s32(a);
}
-// CHECK-LABEL: test_vld3_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int64x1x3_t [[TMP6]]
int64x1x3_t test_vld3_s64(int64_t const * a) {
  return vld3_s64(a);
}
-// CHECK-LABEL: test_vld3_f16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x3_t [[TMP6]]
float16x4x3_t test_vld3_f16(float16_t const * a) {
  return vld3_f16(a);
}
-// CHECK-LABEL:
test_vld3_f32 -// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>* +// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0v2f32(<2 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP6]] float32x2x3_t test_vld3_f32(float32_t const * a) { return vld3_f32(a); } -// CHECK-LABEL: test_vld3_p8 -// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x3_t [[TMP5]] poly8x8x3_t test_vld3_p8(poly8_t const * a) { return vld3_p8(a); } -// CHECK-LABEL: test_vld3_p16 -// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: 
[[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP6]] poly16x4x3_t test_vld3_p16(poly16_t const * a) { return vld3_p16(a); } -// CHECK-LABEL: test_vld3_dup_u8 -// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_dup_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP4]] uint8x8x3_t test_vld3_dup_u8(uint8_t const * a) { return vld3_dup_u8(a); } -// CHECK-LABEL: test_vld3_dup_u16 -// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_dup_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP6]] uint16x4x3_t test_vld3_dup_u16(uint16_t const * a) { return vld3_dup_u16(a); } -// CHECK-LABEL: test_vld3_dup_u32 -// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_dup_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast 
%struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP6]] uint32x2x3_t test_vld3_dup_u32(uint32_t const * a) { return vld3_dup_u32(a); } -// CHECK-LABEL: test_vld3_dup_u64 -// CHECK: vld1.64 +// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_dup_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x3_t [[TMP6]] uint64x1x3_t test_vld3_dup_u64(uint64_t const * a) { return vld3_dup_u64(a); } -// CHECK-LABEL: test_vld3_dup_s8 -// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x3_t [[TMP4]] int8x8x3_t test_vld3_dup_s8(int8_t const * a) { return vld3_dup_s8(a); } -// CHECK-LABEL: test_vld3_dup_s16 -// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast 
%struct.int16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x3_t [[TMP6]] int16x4x3_t test_vld3_dup_s16(int16_t const * a) { return vld3_dup_s16(a); } -// CHECK-LABEL: test_vld3_dup_s32 -// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x3_t [[TMP6]] int32x2x3_t test_vld3_dup_s32(int32_t const * a) { return vld3_dup_s32(a); } -// CHECK-LABEL: test_vld3_dup_s64 -// CHECK: vld1.64 +// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x3_t [[TMP6]] int64x1x3_t test_vld3_dup_s64(int64_t const * a) { return vld3_dup_s64(a); } -// CHECK-LABEL: test_vld3_dup_f16 -// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_dup_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = 
bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x3_t [[TMP6]] float16x4x3_t test_vld3_dup_f16(float16_t const * a) { return vld3_dup_f16(a); } -// CHECK-LABEL: test_vld3_dup_f32 -// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP6]] float32x2x3_t test_vld3_dup_f32(float32_t const * a) { return vld3_dup_f32(a); } -// CHECK-LABEL: test_vld3_dup_p8 -// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x3_t [[TMP4]] poly8x8x3_t test_vld3_dup_p8(poly8_t const * a) { return vld3_dup_p8(a); } -// CHECK-LABEL: test_vld3_dup_p16 -// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_dup_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// 
CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP6]] poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) { return vld3_dup_p16(a); } -// CHECK-LABEL: test_vld3q_lane_u16 -// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> }
[[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x3_t [[TMP16]] uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) { return vld3q_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vld3q_lane_u32 -// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8*
[[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x3_t [[TMP16]] uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) { return vld3q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vld3q_lane_s16 -// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x3_t [[TMP16]] int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) { return vld3q_lane_s16(a, b, 7); } -// CHECK-LABEL:
test_vld3q_lane_s32 -// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x3_t [[TMP16]] int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) { return vld3q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vld3q_lane_f16 -// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK:
[[B:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x half>] %b.coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x3_t [[TMP16]] float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) { return vld3q_lane_f16(a, b, 7); } -// CHECK-LABEL: test_vld3q_lane_f32 -// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr
inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x float>] %b.coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x3_t [[TMP16]] float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) { return vld3q_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vld3q_lane_p16 -// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] =
bitcast %struct.poly16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x3_t [[TMP16]] poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) { return vld3q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vld3_lane_u8 -// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP9]] uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) { return vld3_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vld3_lane_u16 -// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +//
CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP16]] uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) { return vld3_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vld3_lane_u32 -// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>*
[[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP16]] uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) { return vld3_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vld3_lane_s8 -// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast
-// CHECK-LABEL: test_vld3_lane_s8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x3_t [[TMP9]]
int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) { return vld3_lane_s8(a, b, 7); }
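+// NOTE: The 8-bit variants pass %a to the intrinsic directly and skip the
+// <8 x i8> bitcast round-trip, since their elements are already byte-sized.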
-// CHECK-LABEL: test_vld3_lane_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x3_t [[TMP16]]
int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) { return vld3_lane_s16(a, b, 3); }
-// CHECK-LABEL: test_vld3_lane_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x3_t [[TMP16]]
int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) { return vld3_lane_s32(a, b, 1); }
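+// NOTE: There is no half-typed ld3lane overload here: float16x4x3_t is
+// handled by reinterpreting the <4 x half> values as <4 x i16> and reusing
+// @llvm.aarch64.neon.ld3lane.v4i16, as the next block checks.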
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x3_t [[TMP16]]
float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) { return vld3_lane_f16(a, b, 3); }
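+// NOTE: float32x2x3_t, by contrast, goes through the natively typed
+// @llvm.aarch64.neon.ld3lane.v2f32 overload.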
-// CHECK-LABEL: test_vld3_lane_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x3_t [[TMP16]]
float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) { return vld3_lane_f32(a, b, 1); }
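+// NOTE: The polynomial variants below lower identically to the same-width
+// integer variants; only the struct type names differ.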
-// CHECK-LABEL: test_vld3_lane_p8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x3_t [[TMP9]]
poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) { return vld3_lane_p8(a, b, 7); }
-// CHECK-LABEL: test_vld3_lane_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x3_t [[TMP16]]
poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) { return vld3_lane_p16(a, b, 3); }
-// CHECK-LABEL: test_vld4q_u8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x4_t [[TMP5]]
uint8x16x4_t test_vld4q_u8(uint8_t const * a) { return vld4q_u8(a); }
-// CHECK-LABEL: test_vld4q_u16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x4_t [[TMP6]]
uint16x8x4_t test_vld4q_u16(uint16_t const * a) { return vld4q_u16(a); }
-// CHECK-LABEL: test_vld4q_u32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x4_t [[TMP6]]
uint32x4x4_t test_vld4q_u32(uint32_t const * a) { return vld4q_u32(a); }
-// CHECK-LABEL: test_vld4q_s8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x4_t [[TMP5]]
int8x16x4_t test_vld4q_s8(int8_t const * a) { return vld4q_s8(a); }
-// CHECK-LABEL: test_vld4q_s16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x4_t [[TMP6]]
int16x8x4_t test_vld4q_s16(int16_t const * a) { return vld4q_s16(a); }
-// CHECK-LABEL: test_vld4q_s32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x4_t [[TMP6]]
int32x4x4_t test_vld4q_s32(int32_t const * a) { return vld4q_s32(a); }
-// CHECK-LABEL: test_vld4q_f16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float16x8x4_t [[TMP6]]
float16x8x4_t test_vld4q_f16(float16_t const * a) { return vld4q_f16(a); }
-// CHECK-LABEL: test_vld4q_f32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x4_t [[TMP6]]
float32x4x4_t test_vld4q_f32(float32_t const * a) { return vld4q_f32(a); }
-// CHECK-LABEL: test_vld4q_p8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x4_t [[TMP5]]
poly8x16x4_t test_vld4q_p8(poly8_t const * a) { return vld4q_p8(a); }
-// CHECK-LABEL: test_vld4q_p16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x4_t [[TMP6]]
poly16x8x4_t test_vld4q_p16(poly16_t const * a) { return vld4q_p16(a); }
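+// NOTE: The 64-bit (d-register) vld4 tests below check the same ld4
+// intrinsics as the q-register tests, just at half the vector width; the
+// 64-bit-element cases (vld4_u64/vld4_s64) also go through ld4.v1i64 even
+// though the old assembly checks only expected a vld1.64.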
-// CHECK-LABEL: test_vld4_u8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x4_t [[TMP5]]
uint8x8x4_t test_vld4_u8(uint8_t const * a) { return vld4_u8(a); }
-// CHECK-LABEL: test_vld4_u16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
uint16x4x4_t test_vld4_u16(uint16_t const * a) { return vld4_u16(a); }
-// CHECK-LABEL: test_vld4_u32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
uint32x2x4_t test_vld4_u32(uint32_t const * a) { return vld4_u32(a); }
-// CHECK-LABEL: test_vld4_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
uint64x1x4_t test_vld4_u64(uint64_t const * a) { return vld4_u64(a); }
-// CHECK-LABEL: test_vld4_s8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x4_t [[TMP5]]
int8x8x4_t test_vld4_s8(int8_t const * a) { return vld4_s8(a); }
-// CHECK-LABEL: test_vld4_s16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x4_t [[TMP6]]
int16x4x4_t test_vld4_s16(int16_t const * a) { return vld4_s16(a); }
-// CHECK-LABEL: test_vld4_s32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x4_t [[TMP6]]
int32x2x4_t test_vld4_s32(int32_t const * a) { return vld4_s32(a); }
-// CHECK-LABEL: test_vld4_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int64x1x4_t [[TMP6]]
int64x1x4_t test_vld4_s64(int64_t const * a) { return vld4_s64(a); }
-// CHECK-LABEL: test_vld4_f16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x4_t [[TMP6]]
float16x4x4_t test_vld4_f16(float16_t const * a) { return vld4_f16(a); }
-// CHECK-LABEL: test_vld4_f32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x4_t [[TMP6]]
float32x2x4_t test_vld4_f32(float32_t const * a) { return vld4_f32(a); }
-// CHECK-LABEL: test_vld4_p8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x4_t [[TMP5]]
poly8x8x4_t test_vld4_p8(poly8_t const * a) { return vld4_p8(a); }
-// CHECK-LABEL: test_vld4_p16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
poly16x4x4_t test_vld4_p16(poly16_t const * a) { return vld4_p16(a); }
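+// NOTE: The _dup tests lower to @llvm.aarch64.neon.ld4r ("load and
+// replicate"), which takes a pointer to a single element rather than a
+// vector pointer, so only a pointer bitcast is checked before the call.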
-// CHECK-LABEL: test_vld4_dup_u8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_dup_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x4_t [[TMP4]]
uint8x8x4_t test_vld4_dup_u8(uint8_t const * a) { return vld4_dup_u8(a); }
-// CHECK-LABEL: test_vld4_dup_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_dup_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
uint16x4x4_t test_vld4_dup_u16(uint16_t const * a) { return vld4_dup_u16(a); }
-// CHECK-LABEL: test_vld4_dup_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_dup_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
uint32x2x4_t test_vld4_dup_u32(uint32_t const * a) { return vld4_dup_u32(a); }
-// CHECK-LABEL: test_vld4_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_dup_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
uint64x1x4_t test_vld4_dup_u64(uint64_t const * a) { return vld4_dup_u64(a); }
-// CHECK-LABEL: test_vld4_dup_s8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x4_t [[TMP4]]
int8x8x4_t test_vld4_dup_s8(int8_t const * a) { return vld4_dup_s8(a); }
-// CHECK-LABEL: test_vld4_dup_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x4_t [[TMP6]]
int16x4x4_t test_vld4_dup_s16(int16_t const * a) { return vld4_dup_s16(a); }
-// CHECK-LABEL: test_vld4_dup_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x4_t [[TMP6]]
int32x2x4_t test_vld4_dup_s32(int32_t const * a) { return vld4_dup_s32(a); }
-// CHECK-LABEL: test_vld4_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int64x1x4_t [[TMP6]]
int64x1x4_t test_vld4_dup_s64(int64_t const * a) { return vld4_dup_s64(a); }
-// CHECK-LABEL: test_vld4_dup_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_dup_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x4_t [[TMP6]]
float16x4x4_t test_vld4_dup_f16(float16_t const * a) { return vld4_dup_f16(a); }
-// CHECK-LABEL: test_vld4_dup_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x4_t [[TMP6]]
float32x2x4_t test_vld4_dup_f32(float32_t const * a) { return vld4_dup_f32(a); }
-// CHECK-LABEL: test_vld4_dup_p8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_dup_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x4_t [[TMP4]]
poly8x8x4_t test_vld4_dup_p8(poly8_t const * a) { return vld4_dup_p8(a); }
-// CHECK-LABEL: test_vld4_dup_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_dup_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) { return vld4_dup_p16(a); }
@test_vld4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x4_t [[TMP19]] uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) { return vld4q_lane_u16(a, b, 7); }
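+// The vld4q_lane tests that follow all check the same lowering shape: each of
+// the four source vectors is pulled out of the coerced [4 x <N x T>] argument,
+// round-tripped through <16 x i8> (the builtins take generic byte vectors),
+// and passed to @llvm.aarch64.neon.ld4lane together with the lane index as an
+// i64 immediate; the aggregate result is then copied back out through
+// [[__RET]] and [[RETVAL]] via @llvm.memcpy.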
-// CHECK-LABEL: test_vld4q_lane_u32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x4_t [[TMP19]] uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) { return vld4q_lane_u32(a, b, 3); }
-// CHECK-LABEL: test_vld4q_lane_s16 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x4_t [[TMP19]] int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) { return vld4q_lane_s16(a, b, 7); }
-// CHECK-LABEL: test_vld4q_lane_s32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x4_t [[TMP19]] int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) { return vld4q_lane_s32(a, b, 3); }
-// CHECK-LABEL: test_vld4q_lane_f16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x4_t [[TMP19]] float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) { return vld4q_lane_f16(a, b, 7); }
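+// Note: the float16 variants are checked as integer operations; the
+// <8 x half> values are bitcast to <8 x i16>, so the call goes to the same
+// @llvm.aarch64.neon.ld4lane.v8i16 intrinsic used by the i16 tests above.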
-// CHECK-LABEL: test_vld4q_lane_f32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x4_t [[TMP19]] float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) { return vld4q_lane_f32(a, b, 3); }
-// CHECK-LABEL: test_vld4q_lane_p16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x4_t [[TMP19]] poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) { return vld4q_lane_p16(a, b, 7); }
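+// The D-register vld4_lane tests below mirror the Q-register ones with
+// 8-byte alignment and 32-byte struct copies; in the 8-bit variants the
+// <8 x i8> operands go to @llvm.aarch64.neon.ld4lane directly, with no
+// intermediate bitcasts, and the i8* argument %a is used as-is.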
-// CHECK-LABEL: test_vld4_lane_u8 -// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8 +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x4_t [[TMP10]] uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) { return vld4_lane_u8(a, b, 7); }
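+// Illustrative only (a sketch, not checked by this test; `buf` and `out` are
+// hypothetical pointers): a typical use of the lane form is to refill one
+// lane of a previously deinterleaved set, e.g.
+//   uint8x8x4_t v = vld4_u8(buf);      // deinterleave 4 x 8 bytes
+//   v = vld4_lane_u8(buf + 32, v, 7);  // reload lane 7 of each vector
+//   vst4_u8(out, v);                   // interleave and store back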
-// CHECK-LABEL: test_vld4_lane_u16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x4_t [[TMP19]] uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) { return vld4_lane_u16(a, b, 3); }
-// CHECK-LABEL: test_vld4_lane_u32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x4_t [[TMP19]] uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) { return vld4_lane_u32(a, b, 1); }
-// CHECK-LABEL: test_vld4_lane_s8 -// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8 +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x4_t [[TMP10]] int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) { return vld4_lane_s8(a, b, 7); }
-// CHECK-LABEL: test_vld4_lane_s16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x4_t [[TMP19]] int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) { return vld4_lane_s16(a, b, 3); }
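+// The lane argument must be a compile-time constant in [0, lanes-1] for the
+// element type, which is why each test uses the maximum legal index: 7 for
+// eight-lane vectors, 3 for four-lane vectors, 1 for two-lane vectors.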
-// CHECK-LABEL: test_vld4_lane_s32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x4_t [[TMP19]] int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) { return vld4_lane_s32(a, b, 1); }
-// CHECK-LABEL: test_vld4_lane_f16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x4_t [[TMP19]] float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) { return vld4_lane_f16(a, b, 3); }
-// CHECK-LABEL: test_vld4_lane_f32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x4_t [[TMP19]] float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) { return vld4_lane_f32(a, b, 1); }
%struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8 +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x4_t [[TMP10]] poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) { return vld4_lane_p8(a, b, 7); } -// CHECK-LABEL: test_vld4_lane_p16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr
inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x4_t [[TMP19]] poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) { return vld4_lane_p16(a, b, 3); } -// CHECK-LABEL: test_vmax_s8 -// CHECK: vmax.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMAX_I]] int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) { return vmax_s8(a, b); } -// CHECK-LABEL: test_vmax_s16 -// CHECK: vmax.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VMAX2_I]] int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) { return vmax_s16(a, b); } -// CHECK-LABEL: test_vmax_s32 -// CHECK: vmax.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VMAX2_I]] int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) { return vmax_s32(a, b); } -// CHECK-LABEL: test_vmax_u8 -// CHECK: vmax.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMAX_I]] uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) { return vmax_u8(a, b); } -// CHECK-LABEL: test_vmax_u16 -// CHECK: vmax.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VMAX2_I]] uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) { return vmax_u16(a, b); } -// CHECK-LABEL: test_vmax_u32 -// CHECK: vmax.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VMAX2_I]] uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) { return vmax_u32(a, b); } -// CHECK-LABEL: test_vmax_f32 -// CHECK: vmax.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[VMAX_I]], <2 x float> [[VMAX1_I]]) #4 +// CHECK: ret <2 x float> [[VMAX2_I]] float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) { return vmax_f32(a, b); } -// CHECK-LABEL: test_vmaxq_s8 -// CHECK: vmax.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMAX_I]] int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) { return vmaxq_s8(a, b); } -// CHECK-LABEL: test_vmaxq_s16 -// CHECK: vmax.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] 
to <8 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VMAX2_I]] int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) { return vmaxq_s16(a, b); } -// CHECK-LABEL: test_vmaxq_s32 -// CHECK: vmax.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VMAX2_I]] int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) { return vmaxq_s32(a, b); } -// CHECK-LABEL: test_vmaxq_u8 -// CHECK: vmax.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMAX_I]] uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) { return vmaxq_u8(a, b); } -// CHECK-LABEL: test_vmaxq_u16 -// CHECK: vmax.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VMAX2_I]] uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) { return vmaxq_u16(a, b); } -// CHECK-LABEL: test_vmaxq_u32 -// CHECK: vmax.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VMAX2_I]] uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) { return vmaxq_u32(a, b); } -// CHECK-LABEL: test_vmaxq_f32 -// CHECK: vmax.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[VMAX_I]], <4 x float> [[VMAX1_I]]) #4 +// CHECK: ret <4 x float> [[VMAX2_I]] float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); } -// CHECK-LABEL: test_vmin_s8 -// CHECK: vmin.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x 
i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMIN_I]] int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) { return vmin_s8(a, b); } -// CHECK-LABEL: test_vmin_s16 -// CHECK: vmin.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VMIN2_I]] int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) { return vmin_s16(a, b); } -// CHECK-LABEL: test_vmin_s32 -// CHECK: vmin.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VMIN2_I]] int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) { return vmin_s32(a, b); } -// CHECK-LABEL: test_vmin_u8 -// CHECK: vmin.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMIN_I]] uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) { return vmin_u8(a, b); } -// CHECK-LABEL: test_vmin_u16 -// CHECK: vmin.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VMIN2_I]] uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) { return vmin_u16(a, b); } -// CHECK-LABEL: test_vmin_u32 -// CHECK: vmin.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VMIN2_I]] uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) { return vmin_u32(a, b); } -// CHECK-LABEL: test_vmin_f32 -// CHECK: vmin.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> 
+// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[VMIN_I]], <2 x float> [[VMIN1_I]]) #4 +// CHECK: ret <2 x float> [[VMIN2_I]] float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) { return vmin_f32(a, b); } -// CHECK-LABEL: test_vminq_s8 -// CHECK: vmin.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMIN_I]] int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) { return vminq_s8(a, b); } -// CHECK-LABEL: test_vminq_s16 -// CHECK: vmin.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VMIN2_I]] int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) { return vminq_s16(a, b); } -// CHECK-LABEL: test_vminq_s32 -// CHECK: vmin.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VMIN2_I]] int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) { return vminq_s32(a, b); } -// CHECK-LABEL: test_vminq_u8 -// CHECK: vmin.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMIN_I]] uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) { return vminq_u8(a, b); } -// CHECK-LABEL: test_vminq_u16 -// CHECK: vmin.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VMIN2_I]] uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) { return vminq_u16(a, b); } -// CHECK-LABEL: test_vminq_u32 -// CHECK: vmin.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: 
[[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VMIN2_I]] uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) { return vminq_u32(a, b); } -// CHECK-LABEL: test_vminq_f32 -// CHECK: vmin.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[VMIN_I]], <4 x float> [[VMIN1_I]]) #4 +// CHECK: ret <4 x float> [[VMIN2_I]] float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); } -// CHECK-LABEL: test_vmla_s8 -// CHECK: vmla.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]] +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vmla_s8(a, b, c); } -// CHECK-LABEL: test_vmla_s16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmla_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_s16(a, b, c); } -// CHECK-LABEL: test_vmla_s32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_s32(a, b, c); } -// CHECK-LABEL: test_vmla_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c +// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]] +// CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_f32(a, b, c); } -// CHECK-LABEL: test_vmla_u8 -// CHECK: vmla.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]] +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmla_u8(a, b, c); } -// CHECK-LABEL: test_vmla_u16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) 
{ return vmla_u16(a, b, c); } -// CHECK-LABEL: test_vmla_u32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_u32(a, b, c); } -// CHECK-LABEL: test_vmlaq_s8 -// CHECK: vmla.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]] +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlaq_s8(a, b, c); } -// CHECK-LABEL: test_vmlaq_s16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlaq_s16(a, b, c); } -// CHECK-LABEL: test_vmlaq_s32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlaq_s32(a, b, c); } -// CHECK-LABEL: test_vmlaq_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c +// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]] +// CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlaq_f32(a, b, c); } -// CHECK-LABEL: test_vmlaq_u8 -// CHECK: vmla.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]] +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlaq_u8(a, b, c); } -// CHECK-LABEL: test_vmlaq_u16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlaq_u16(a, b, c); } -// CHECK-LABEL: test_vmlaq_u32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlaq_u32(a, b, c); } -// CHECK-LABEL: test_vmlal_s8 -// CHECK: 
vmlal.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlal_s8(a, b, c); } -// CHECK-LABEL: test_vmlal_s16 -// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_s16(a, b, c); } -// CHECK-LABEL: test_vmlal_s32 -// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_s32(a, b, c); } -// CHECK-LABEL: test_vmlal_u8 -// CHECK: vmlal.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlal_u8(a, b, c); } -// CHECK-LABEL: test_vmlal_u16 -// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_u16(a, b, c); } -// CHECK-LABEL: test_vmlal_u32 -// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: 
[[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_u32(a, b, c); } -// CHECK-LABEL: test_vmlal_lane_s16 -// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmlal_lane_s32 -// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmlal_lane_u16 -// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmlal_lane_u32 -// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8>
[[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_lane_u32(a, b, c, 1); } -// CHECK-LABEL: test_vmlal_n_s16 -// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vmlal_n_s16(a, b, c); } -// CHECK-LABEL: test_vmlal_n_s32 -// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vmlal_n_s32(a, b, c); } -// CHECK-LABEL: test_vmlal_n_u16 -// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, 
uint16_t c) { return vmlal_n_u16(a, b, c); } -// CHECK-LABEL: test_vmlal_n_u32 -// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlal_n_u32(a, b, c); } -// CHECK-LABEL: test_vmla_lane_s16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmla_lane_s32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmla_lane_u16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmla_lane_u32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_lane_u32(a, b, c, 1); } -// CHECK-LABEL: test_vmla_lane_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]]
= fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_lane_f32(a, b, c, 1); } -// CHECK-LABEL: test_vmlaq_lane_s16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlaq_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmlaq_lane_s32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlaq_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmlaq_lane_u16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlaq_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmlaq_lane_u32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlaq_lane_u32(a, b, c, 1); } -// CHECK-LABEL: test_vmlaq_lane_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlaq_lane_f32(a, b, c, 1); } -// CHECK-LABEL: test_vmla_n_s16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16>
[[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return vmla_n_s16(a, b, c); } -// CHECK-LABEL: test_vmla_n_s32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return vmla_n_s32(a, b, c); } -// CHECK-LABEL: test_vmla_n_u16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmla_n_u16(a, b, c); } -// CHECK-LABEL: test_vmla_n_u32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmla_n_u32(a, b, c); } -// CHECK-LABEL: test_vmla_n_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]] +// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]] +// CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) { return vmla_n_f32(a, b, c); } -// CHECK-LABEL: test_vmlaq_n_s16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: 
[[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { return vmlaq_n_s16(a, b, c); } -// CHECK-LABEL: test_vmlaq_n_s32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return vmlaq_n_s32(a, b, c); } -// CHECK-LABEL: test_vmlaq_n_u16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlaq_n_u16(a, b, c); } -// CHECK-LABEL: test_vmlaq_n_u32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlaq_n_u32(a, b, c); } -// CHECK-LABEL: test_vmlaq_n_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0] -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, -// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, 
float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3 +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]] +// CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { return vmlaq_n_f32(a, b, c); } -// CHECK-LABEL: test_vmls_s8 -// CHECK: vmls.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]] +// CHECK: ret <8 x i8> [[SUB_I]] int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vmls_s8(a, b, c); } -// CHECK-LABEL: test_vmls_s16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmls_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_s16(a, b, c); } -// CHECK-LABEL: test_vmls_s32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_s32(a, b, c); } -// CHECK-LABEL: test_vmls_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c +// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]] +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_f32(a, b, c); } -// CHECK-LABEL: test_vmls_u8 -// CHECK: vmls.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]] +// CHECK: ret <8 x i8> [[SUB_I]] uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmls_u8(a, b, c); } -// CHECK-LABEL: test_vmls_u16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_u16(a, b, c); } -// CHECK-LABEL: test_vmls_u32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_u32(a, b, 
c); } -// CHECK-LABEL: test_vmlsq_s8 -// CHECK: vmls.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]] +// CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlsq_s8(a, b, c); } -// CHECK-LABEL: test_vmlsq_s16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlsq_s16(a, b, c); } -// CHECK-LABEL: test_vmlsq_s32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlsq_s32(a, b, c); } -// CHECK-LABEL: test_vmlsq_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]] +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlsq_f32(a, b, c); } -// CHECK-LABEL: test_vmlsq_u8 -// CHECK: vmls.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]] +// CHECK: ret <16 x i8> [[SUB_I]] uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlsq_u8(a, b, c); } -// CHECK-LABEL: test_vmlsq_u16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlsq_u16(a, b, c); } -// CHECK-LABEL: test_vmlsq_u32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlsq_u32(a, b, c); } -// CHECK-LABEL: test_vmlsl_s8 -// CHECK: vmlsl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlsl_s8(a, b, c); } -// 
CHECK-LABEL: test_vmlsl_s16 -// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_s16(a, b, c); } -// CHECK-LABEL: test_vmlsl_s32 -// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_s32(a, b, c); } -// CHECK-LABEL: test_vmlsl_u8 -// CHECK: vmlsl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlsl_u8(a, b, c); } -// CHECK-LABEL: test_vmlsl_u16 -// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_u16(a, b, c); } -// CHECK-LABEL: test_vmlsl_u32 -// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_u32(a, b, 
c); } -// CHECK-LABEL: test_vmlsl_lane_s16 -// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmlsl_lane_s32 -// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmlsl_lane_u16 -// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmlsl_lane_u32 -// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_lane_u32(a, b, c, 1); } -// CHECK-LABEL: 
test_vmlsl_n_s16 -// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vmlsl_n_s16(a, b, c); } -// CHECK-LABEL: test_vmlsl_n_s32 -// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vmlsl_n_s32(a, b, c); } -// CHECK-LABEL: test_vmlsl_n_u16 -// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { return vmlsl_n_u16(a, b, c); } -// CHECK-LABEL: test_vmlsl_n_u32 -// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to 
<8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlsl_n_u32(a, b, c); } -// CHECK-LABEL: test_vmls_lane_s16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmls_lane_s32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmls_lane_u16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmls_lane_u32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_lane_u32(a, b, c, 1); } -// CHECK-LABEL: test_vmls_lane_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_lane_f32(a, b, c, 1); } -// CHECK-LABEL: test_vmlsq_lane_s16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) 
#0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlsq_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmlsq_lane_s32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlsq_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmlsq_lane_u16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlsq_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmlsq_lane_u32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlsq_lane_u32(a, b, c, 1); } -// CHECK-LABEL: test_vmlsq_lane_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlsq_lane_f32(a, b, c, 1); } -// CHECK-LABEL: test_vmls_n_s16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return vmls_n_s16(a, b, c); } -// CHECK-LABEL: test_vmls_n_s32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 
%c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return vmls_n_s32(a, b, c); } -// CHECK-LABEL: test_vmls_n_u16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmls_n_u16(a, b, c); } -// CHECK-LABEL: test_vmls_n_u32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmls_n_u32(a, b, c); } -// CHECK-LABEL: test_vmls_n_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]] +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) { return vmls_n_f32(a, b, c); } -// CHECK-LABEL: test_vmlsq_n_s16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, 
int16_t c) { return vmlsq_n_s16(a, b, c); } -// CHECK-LABEL: test_vmlsq_n_s32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return vmlsq_n_s32(a, b, c); } -// CHECK-LABEL: test_vmlsq_n_u16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlsq_n_u16(a, b, c); } -// CHECK-LABEL: test_vmlsq_n_u32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlsq_n_u32(a, b, c); } -// CHECK-LABEL: test_vmlsq_n_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0] -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3 +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]] +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, 
float32_t c) { return vmlsq_n_f32(a, b, c); } -// CHECK-LABEL: test_vmovl_s8 -// CHECK: vmovl.s8 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 { +// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I]] int16x8_t test_vmovl_s8(int8x8_t a) { return vmovl_s8(a); } -// CHECK-LABEL: test_vmovl_s16 -// CHECK: vmovl.s16 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I]] int32x4_t test_vmovl_s16(int16x4_t a) { return vmovl_s16(a); } -// CHECK-LABEL: test_vmovl_s32 -// CHECK: vmovl.s32 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I]] int64x2_t test_vmovl_s32(int32x2_t a) { return vmovl_s32(a); } -// CHECK-LABEL: test_vmovl_u8 -// CHECK: vmovl.u8 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 { +// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I]] uint16x8_t test_vmovl_u8(uint8x8_t a) { return vmovl_u8(a); } -// CHECK-LABEL: test_vmovl_u16 -// CHECK: vmovl.u16 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I]] uint32x4_t test_vmovl_u16(uint16x4_t a) { return vmovl_u16(a); } -// CHECK-LABEL: test_vmovl_u32 -// CHECK: vmovl.u32 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I]] uint64x2_t test_vmovl_u32(uint32x2_t a) { return vmovl_u32(a); } -// CHECK-LABEL: test_vmovn_s16 -// CHECK: vmovn.i16 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK: ret <8 x i8> [[VMOVN_I]] int8x8_t test_vmovn_s16(int16x8_t a) { return vmovn_s16(a); } -// CHECK-LABEL: test_vmovn_s32 -// CHECK: vmovn.i32 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK: ret <4 x i16> [[VMOVN_I]] int16x4_t test_vmovn_s32(int32x4_t a) { return vmovn_s32(a); } -// CHECK-LABEL: test_vmovn_s64 -// CHECK: vmovn.i64 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// 
CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK: ret <2 x i32> [[VMOVN_I]] int32x2_t test_vmovn_s64(int64x2_t a) { return vmovn_s64(a); } -// CHECK-LABEL: test_vmovn_u16 -// CHECK: vmovn.i16 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK: ret <8 x i8> [[VMOVN_I]] uint8x8_t test_vmovn_u16(uint16x8_t a) { return vmovn_u16(a); } -// CHECK-LABEL: test_vmovn_u32 -// CHECK: vmovn.i32 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK: ret <4 x i16> [[VMOVN_I]] uint16x4_t test_vmovn_u32(uint32x4_t a) { return vmovn_u32(a); } -// CHECK-LABEL: test_vmovn_u64 -// CHECK: vmovn.i64 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK: ret <2 x i32> [[VMOVN_I]] uint32x2_t test_vmovn_u64(uint64x2_t a) { return vmovn_u64(a); } -// CHECK-LABEL: test_vmov_n_u8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VECINIT7_I]] uint8x8_t test_vmov_n_u8(uint8_t a) { return vmov_n_u8(a); } -// CHECK-LABEL: test_vmov_n_u16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VECINIT3_I]] uint16x4_t test_vmov_n_u16(uint16_t a) { return vmov_n_u16(a); } -// CHECK-LABEL: test_vmov_n_u32 -// CHECK: mov {{r[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VECINIT1_I]] uint32x2_t test_vmov_n_u32(uint32_t a) { return vmov_n_u32(a); } -// CHECK-LABEL: test_vmov_n_s8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = 
insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VECINIT7_I]] int8x8_t test_vmov_n_s8(int8_t a) { return vmov_n_s8(a); } -// CHECK-LABEL: test_vmov_n_s16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VECINIT3_I]] int16x4_t test_vmov_n_s16(int16_t a) { return vmov_n_s16(a); } -// CHECK-LABEL: test_vmov_n_s32 -// CHECK: mov {{r[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VECINIT1_I]] int32x2_t test_vmov_n_s32(int32_t a) { return vmov_n_s32(a); } -// CHECK-LABEL: test_vmov_n_p8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VECINIT7_I]] poly8x8_t test_vmov_n_p8(poly8_t a) { return vmov_n_p8(a); } -// CHECK-LABEL: test_vmov_n_p16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VECINIT3_I]] poly16x4_t test_vmov_n_p16(poly16_t a) { return vmov_n_p16(a); } -// CHECK-LABEL: test_vmov_n_f16 -// CHECK: vld1.16 {{{d[0-9]+\[\]}}} +// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a) #0 { +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0 +// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half>
[[VECINIT2]], half [[TMP0]], i32 3 +// CHECK: ret <4 x half> [[VECINIT3]] float16x4_t test_vmov_n_f16(float16_t *a) { return vmov_n_f16(*a); } -// CHECK-LABEL: test_vmov_n_f32 -// CHECK: mov {{r[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(float %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1 +// CHECK: ret <2 x float> [[VECINIT1_I]] float32x2_t test_vmov_n_f32(float32_t a) { return vmov_n_f32(a); } -// CHECK-LABEL: test_vmovq_n_u8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 +// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 +// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 +// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 +// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 +// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 +// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 +// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VECINIT15_I]] uint8x16_t test_vmovq_n_u8(uint8_t a) { return vmovq_n_u8(a); } -// CHECK-LABEL: test_vmovq_n_u16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VECINIT7_I]] uint16x8_t test_vmovq_n_u16(uint16_t a) { return vmovq_n_u16(a); } -// CHECK-LABEL: test_vmovq_n_u32 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VECINIT3_I]]
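// Editor's note, not part of the original patch: before optimization the
// vmov*_n_* and vdup*_n_* splat intrinsics lower to the insertelement chains
// checked above; a single splat shuffle only appears once the optimizer runs.
// A minimal usage sketch under that assumption (the helper name splat4_u32 is
// hypothetical, not something this test defines):
//   static inline uint32x4_t splat4_u32(uint32_t x) {
//     return vmovq_n_u32(x); /* lane-for-lane the same as vdupq_n_u32(x) */
//   }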
uint32x4_t test_vmovq_n_u32(uint32_t a) { return vmovq_n_u32(a); } -// CHECK-LABEL: test_vmovq_n_s8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 +// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 +// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 +// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 +// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 +// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 +// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 +// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VECINIT15_I]] int8x16_t test_vmovq_n_s8(int8_t a) { return vmovq_n_s8(a); } -// CHECK-LABEL: test_vmovq_n_s16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VECINIT7_I]] int16x8_t test_vmovq_n_s16(int16_t a) { return vmovq_n_s16(a); } -// CHECK-LABEL: test_vmovq_n_s32 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VECINIT3_I]] int32x4_t test_vmovq_n_s32(int32_t a) { return vmovq_n_s32(a); } -// CHECK-LABEL: test_vmovq_n_p8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x 
i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 +// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 +// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 +// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 +// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 +// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 +// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 +// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VECINIT15_I]] poly8x16_t test_vmovq_n_p8(poly8_t a) { return vmovq_n_p8(a); } -// CHECK-LABEL: test_vmovq_n_p16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VECINIT7_I]] poly16x8_t test_vmovq_n_p16(poly16_t a) { return vmovq_n_p16(a); } -// CHECK-LABEL: test_vmovq_n_f16 -// CHECK: vld1.16 {{{d[0-9]+\[\], d[0-9]+\[\]}}} +// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a) #0 { +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0 +// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3 +// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4 +// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5 +// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6 +// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7 +// CHECK: ret <8 x half> [[VECINIT7]] float16x8_t test_vmovq_n_f16(float16_t *a) { return vmovq_n_f16(*a); } -// CHECK-LABEL: test_vmovq_n_f32 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(float %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x
float> [[VECINIT2_I]], float %a, i32 3 +// CHECK: ret <4 x float> [[VECINIT3_I]] float32x4_t test_vmovq_n_f32(float32_t a) { return vmovq_n_f32(a); } -// CHECK-LABEL: test_vmov_n_s64 -// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0 -// CHECK: vmov.32 [[REG]][1], r1 +// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] +// CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vmov_n_s64(int64_t a) { int64x1_t tmp = vmov_n_s64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: test_vmov_n_u64 -// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0 -// CHECK: vmov.32 [[REG]][1], r1 +// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] +// CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vmov_n_u64(uint64_t a) { uint64x1_t tmp = vmov_n_u64(a); return vadd_u64(tmp, tmp); } -// CHECK-LABEL: test_vmovq_n_s64 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VECINIT1_I]] int64x2_t test_vmovq_n_s64(int64_t a) { return vmovq_n_s64(a); } -// CHECK-LABEL: test_vmovq_n_u64 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VECINIT1_I]] uint64x2_t test_vmovq_n_u64(uint64_t a) { return vmovq_n_u64(a); } -// CHECK-LABEL: test_vmul_s8 -// CHECK: vmul.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[MUL_I]] int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) { return vmul_s8(a, b); } -// CHECK-LABEL: test_vmul_s16 -// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[MUL_I]] int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) { return vmul_s16(a, b); } -// CHECK-LABEL: test_vmul_s32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[MUL_I]] int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) { return vmul_s32(a, b); } -// CHECK-LABEL: test_vmul_f32 -// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, %b +// CHECK: ret <2 x float> [[MUL_I]] float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) { return vmul_f32(a, b); } -// CHECK-LABEL: test_vmul_u8 -// CHECK: vmul.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[MUL_I]] uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) { return vmul_u8(a, b); } -// CHECK-LABEL: test_vmul_u16 -// CHECK: vmul.i16 
d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[MUL_I]] uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) { return vmul_u16(a, b); } -// CHECK-LABEL: test_vmul_u32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[MUL_I]] uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) { return vmul_u32(a, b); } -// CHECK-LABEL: test_vmulq_s8 -// CHECK: vmul.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[MUL_I]] int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) { return vmulq_s8(a, b); } -// CHECK-LABEL: test_vmulq_s16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[MUL_I]] int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) { return vmulq_s16(a, b); } -// CHECK-LABEL: test_vmulq_s32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[MUL_I]] int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) { return vmulq_s32(a, b); } -// CHECK-LABEL: test_vmulq_f32 -// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, %b +// CHECK: ret <4 x float> [[MUL_I]] float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) { return vmulq_f32(a, b); } -// CHECK-LABEL: test_vmulq_u8 -// CHECK: vmul.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[MUL_I]] uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) { return vmulq_u8(a, b); } -// CHECK-LABEL: test_vmulq_u16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[MUL_I]] uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) { return vmulq_u16(a, b); } -// CHECK-LABEL: test_vmulq_u32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[MUL_I]] uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) { return vmulq_u32(a, b); } -// CHECK-LABEL: test_vmull_s8 -// CHECK: vmull.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) { return vmull_s8(a, b); } -// CHECK-LABEL: test_vmull_s16 -// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) { return vmull_s16(a, b); } -// CHECK-LABEL: test_vmull_s32 -// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) { return vmull_s32(a, b); } -// CHECK-LABEL: test_vmull_u8 -// CHECK: vmull.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) { return vmull_u8(a, b); } -// CHECK-LABEL: test_vmull_u16 -// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) { return vmull_u16(a, b); } -// CHECK-LABEL: test_vmull_u32 -// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) { return vmull_u32(a, b); } -// CHECK-LABEL: test_vmull_p8 -// CHECK: vmull.p8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { return vmull_p8(a, b); } -// CHECK-LABEL: test_vmull_lane_s16 -// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 
x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) { return vmull_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vmull_lane_s32 -// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) { return vmull_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vmull_lane_u16 -// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) { return vmull_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vmull_lane_u32 -// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) { return vmull_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vmull_n_s16 -// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x 
-// CHECK-LABEL: test_vmull_n_s16 -// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL5_I]] int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { return vmull_n_s16(a, b); }
-// CHECK-LABEL: test_vmull_n_s32 -// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL3_I]] int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { return vmull_n_s32(a, b); }
-// CHECK-LABEL: test_vmull_n_u16 -// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL5_I]] uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { return vmull_n_u16(a, b); }
-// CHECK-LABEL: test_vmull_n_u32 -// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL3_I]] uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { return vmull_n_u32(a, b); }
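// Editorial sketch (not part of the checked test): the vmull_n_* IR above
// first materializes the scalar operand as a vector through a chain of
// insertelement instructions, i.e. a vdup spelled out. The sketch name is
// hypothetical; it assumes <arm_neon.h>, as included by this file.
int32x4_t mull_n_sketch(int16x4_t a, int16_t b) {
  // same result as vmull_n_s16(a, b): splat b across all lanes, then
  // perform the widening multiply
  return vmull_s16(a, vdup_n_s16(b));
}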
-// CHECK-LABEL: test_vmul_p8 -// CHECK: vmul.p8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMUL_V_I]] poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) { return vmul_p8(a, b); }
-// CHECK-LABEL: test_vmulq_p8 -// CHECK: vmul.p8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMULQ_V_I]] poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) { return vmulq_p8(a, b); }
-// CHECK-LABEL: test_vmul_lane_s16 -// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) { return vmul_lane_s16(a, b, 3); }
-// CHECK-LABEL: test_vmul_lane_s32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) { return vmul_lane_s32(a, b, 1); }
-// CHECK-LABEL: test_vmul_lane_f32 -// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) { return vmul_lane_f32(a, b, 1); }
-// CHECK-LABEL: test_vmul_lane_u16 -// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) { return vmul_lane_u16(a, b, 3); }
-// CHECK-LABEL: test_vmul_lane_u32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) { return vmul_lane_u32(a, b, 1); }
-// CHECK-LABEL: test_vmulq_lane_s16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) { return vmulq_lane_s16(a, b, 3); }
-// CHECK-LABEL: test_vmulq_lane_s32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) { return vmulq_lane_s32(a, b, 1); }
-// CHECK-LABEL: test_vmulq_lane_f32 -// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) { return vmulq_lane_f32(a, b, 1); }
-// CHECK-LABEL: test_vmulq_lane_u16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) { return vmulq_lane_u16(a, b, 3); }
-// CHECK-LABEL: test_vmulq_lane_u32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) { return vmulq_lane_u32(a, b, 1); }
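// Editorial sketch (not part of the checked test): unlike vmull_lane, the
// non-widening lane forms above lower to a shufflevector splat followed by a
// plain mul/fmul, with no target intrinsic at all. The sketch name is
// hypothetical; it assumes <arm_neon.h>, as included by this file.
float32x4_t mulq_lane1_sketch(float32x4_t a, float32x2_t b) {
  // same result as vmulq_lane_f32(a, b, 1): splat lane 1 of b to four
  // lanes (mask <1, 1, 1, 1>), then a plain fmul
  return vmulq_f32(a, vdupq_lane_f32(b, 1));
}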
-// CHECK-LABEL: test_vmul_n_s16 -// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i16> [[MUL_I]] int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) { return vmul_n_s16(a, b); }
-// CHECK-LABEL: test_vmul_n_s32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] +// CHECK: ret <2 x i32> [[MUL_I]] int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) { return vmul_n_s32(a, b); }
-// CHECK-LABEL: test_vmul_n_f32 -// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]] +// CHECK: ret <2 x float> [[MUL_I]] float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) { return vmul_n_f32(a, b); }
-// CHECK-LABEL: test_vmul_n_u16 -// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i16> [[MUL_I]] uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) { return vmul_n_u16(a, b); }
-// CHECK-LABEL: test_vmul_n_u32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] +// CHECK: ret <2 x i32> [[MUL_I]] uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) { return vmul_n_u32(a, b); }
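// Editorial sketch (not part of the checked test): vmul_n_* is the same
// splat-then-multiply pattern with a scalar argument; the insertelement chain
// above is just vdup_n spelled out in IR. The sketch name is hypothetical; it
// assumes <arm_neon.h>, as included by this file.
float32x2_t mul_n_sketch(float32x2_t a, float32_t b) {
  // same result as vmul_n_f32(a, b)
  return vmul_f32(a, vdup_n_f32(b));
}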
-// CHECK-LABEL: test_vmulq_n_s16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] +// CHECK: ret <8 x i16> [[MUL_I]] int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) { return vmulq_n_s16(a, b); }
-// CHECK-LABEL: test_vmulq_n_s32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i32> [[MUL_I]] int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) { return vmulq_n_s32(a, b); }
-// CHECK-LABEL: test_vmulq_n_f32 -// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0] +// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3 +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]] +// CHECK: ret <4 x float> [[MUL_I]] float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) { return vmulq_n_f32(a, b); }
-// CHECK-LABEL: test_vmulq_n_u16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] +// CHECK: ret <8 x i16> [[MUL_I]] uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) { return vmulq_n_u16(a, b); }
-// CHECK-LABEL: test_vmulq_n_u32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i32> [[MUL_I]] uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) { return vmulq_n_u32(a, b); }
-// CHECK-LABEL: test_vmvn_s8 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <8 x i8> [[NEG_I]] int8x8_t test_vmvn_s8(int8x8_t a) { return vmvn_s8(a); }
-// CHECK-LABEL: test_vmvn_s16 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: ret <4 x i16> [[NEG_I]] int16x4_t test_vmvn_s16(int16x4_t a) { return vmvn_s16(a); }
-// CHECK-LABEL: test_vmvn_s32 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1> +// CHECK: ret <2 x i32> [[NEG_I]] int32x2_t test_vmvn_s32(int32x2_t a) { return vmvn_s32(a); }
-// CHECK-LABEL: test_vmvn_u8 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <8 x i8> [[NEG_I]] uint8x8_t test_vmvn_u8(uint8x8_t a) { return vmvn_u8(a); }
-// CHECK-LABEL: test_vmvn_u16 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: ret <4 x i16> [[NEG_I]] uint16x4_t test_vmvn_u16(uint16x4_t a) { return vmvn_u16(a); }
-// CHECK-LABEL: test_vmvn_u32 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1> +// CHECK: ret <2 x i32> [[NEG_I]] uint32x2_t test_vmvn_u32(uint32x2_t a) { return vmvn_u32(a); }
-// CHECK-LABEL: test_vmvn_p8 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <8 x i8> [[NEG_I]] poly8x8_t test_vmvn_p8(poly8x8_t a) { return vmvn_p8(a); }
-// CHECK-LABEL: test_vmvnq_s8 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <16 x i8> [[NEG_I]] int8x16_t test_vmvnq_s8(int8x16_t a) { return vmvnq_s8(a); }
-// CHECK-LABEL: test_vmvnq_s16 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: ret <8 x i16> [[NEG_I]] int16x8_t test_vmvnq_s16(int16x8_t a) { return vmvnq_s16(a); }
-// CHECK-LABEL: test_vmvnq_s32 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: ret <4 x i32> [[NEG_I]] int32x4_t test_vmvnq_s32(int32x4_t a) { return vmvnq_s32(a); }
-// CHECK-LABEL: test_vmvnq_u8 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <16 x i8> [[NEG_I]] uint8x16_t test_vmvnq_u8(uint8x16_t a) { return vmvnq_u8(a); }
-// CHECK-LABEL: test_vmvnq_u16 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: ret <8 x i16> [[NEG_I]] uint16x8_t test_vmvnq_u16(uint16x8_t a) { return vmvnq_u16(a); }
-// CHECK-LABEL: test_vmvnq_u32 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: ret <4 x i32> [[NEG_I]] uint32x4_t test_vmvnq_u32(uint32x4_t a) { return vmvnq_u32(a); }
-// CHECK-LABEL: test_vmvnq_p8 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <16 x i8> [[NEG_I]] poly8x16_t test_vmvnq_p8(poly8x16_t a) { return vmvnq_p8(a); }
-// CHECK-LABEL: test_vneg_s8 -// CHECK: vneg.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a +// CHECK: ret <8 x i8> [[SUB_I]] int8x8_t test_vneg_s8(int8x8_t a) { return vneg_s8(a); }
-// CHECK-LABEL: test_vneg_s16 -// CHECK: vneg.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a +// CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vneg_s16(int16x4_t a) { return vneg_s16(a); }
-// CHECK-LABEL: test_vneg_s32 -// CHECK: vneg.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vneg_s32(int32x2_t a) { return vneg_s32(a); }
-// CHECK-LABEL: test_vneg_f32 -// CHECK: vneg.f32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vneg_f32(float32x2_t a) { return vneg_f32(a); }
-// CHECK-LABEL: test_vnegq_s8 -// CHECK: vneg.s8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a +// CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vnegq_s8(int8x16_t a) { return vnegq_s8(a); }
-// CHECK-LABEL: test_vnegq_s16 -// CHECK: vneg.s16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vnegq_s16(int16x8_t a) { return vnegq_s16(a); }
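// Editorial sketch (not part of the checked test): vmvn lowers to a plain xor
// with an all-ones splat, integer vneg to a sub from zeroinitializer, and
// (in the blocks that follow) floating-point vneg to an fsub from -0.0 so
// that signed zeros are negated correctly. The sketch names are hypothetical;
// they assume <arm_neon.h>, as included by this file.
uint8x8_t not_sketch(uint8x8_t a) {
  return vmvn_u8(a);    // IR: xor %a, <i8 -1, i8 -1, ..., i8 -1>
}
float32x2_t neg_sketch(float32x2_t a) {
  return vneg_f32(a);   // IR: fsub <float -0.0, float -0.0>, %a
}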
-// CHECK-LABEL: test_vnegq_s32 -// CHECK: vneg.s32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vnegq_s32(int32x4_t a) { return vnegq_s32(a); }
-// CHECK-LABEL: test_vnegq_f32 -// CHECK: vneg.f32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vnegq_f32(float32x4_t a) { return vnegq_f32(a); }
-// CHECK-LABEL: test_vorn_s8 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[OR_I]] int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) { return vorn_s8(a, b); }
-// CHECK-LABEL: test_vorn_s16 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[OR_I]] int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) { return vorn_s16(a, b); }
-// CHECK-LABEL: test_vorn_s32 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1> +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[OR_I]] int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) { return vorn_s32(a, b); }
-// CHECK-LABEL: test_vorn_s64 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1> +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[OR_I]] int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) { return vorn_s64(a, b); }
-// CHECK-LABEL: test_vorn_u8 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[OR_I]] uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) { return vorn_u8(a, b); }
-// CHECK-LABEL: test_vorn_u16 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[OR_I]] uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) { return vorn_u16(a, b); }
-// CHECK-LABEL: test_vorn_u32 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1> +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[OR_I]] uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) { return vorn_u32(a, b); }
-// CHECK-LABEL: test_vorn_u64 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1> +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[OR_I]] uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) { return vorn_u64(a, b); }
-// CHECK-LABEL: test_vornq_s8 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[OR_I]] int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) { return vornq_s8(a, b); }
-// CHECK-LABEL: test_vornq_s16 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[OR_I]] int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) { return vornq_s16(a, b); }
-// CHECK-LABEL: test_vornq_s32 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[OR_I]] int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) { return vornq_s32(a, b); }
-// CHECK-LABEL: test_vornq_s64 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1> +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[OR_I]] int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) { return vornq_s64(a, b); }
-// CHECK-LABEL: test_vornq_u8 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[OR_I]] uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) { return vornq_u8(a, b); }
-// CHECK-LABEL: test_vornq_u16 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[OR_I]] uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) { return vornq_u16(a, b); }
-// CHECK-LABEL: test_vornq_u32 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[OR_I]] uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) { return vornq_u32(a, b); }
-// CHECK-LABEL: test_vornq_u64 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1> +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[OR_I]] uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) { return vornq_u64(a, b); }
-// CHECK-LABEL: test_vorr_s8 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[OR_I]] int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) { return vorr_s8(a, b); }
-// CHECK-LABEL: test_vorr_s16 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b +// CHECK: ret <4 x i16>
[[OR_I]] int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) { return vorr_s16(a, b); } -// CHECK-LABEL: test_vorr_s32 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[OR_I]] int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) { return vorr_s32(a, b); } -// CHECK-LABEL: test_vorr_s64 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[OR_I]] int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) { return vorr_s64(a, b); } -// CHECK-LABEL: test_vorr_u8 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[OR_I]] uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) { return vorr_u8(a, b); } -// CHECK-LABEL: test_vorr_u16 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[OR_I]] uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) { return vorr_u16(a, b); } -// CHECK-LABEL: test_vorr_u32 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[OR_I]] uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) { return vorr_u32(a, b); } -// CHECK-LABEL: test_vorr_u64 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[OR_I]] uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) { return vorr_u64(a, b); } -// CHECK-LABEL: test_vorrq_s8 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[OR_I]] int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) { return vorrq_s8(a, b); } -// CHECK-LABEL: test_vorrq_s16 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[OR_I]] int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) { return vorrq_s16(a, b); } -// CHECK-LABEL: test_vorrq_s32 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[OR_I]] int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) { return vorrq_s32(a, b); } -// CHECK-LABEL: test_vorrq_s64 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[OR_I]] int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) { return vorrq_s64(a, b); } -// CHECK-LABEL: test_vorrq_u8 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b +// 
CHECK: ret <16 x i8> [[OR_I]] uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) { return vorrq_u8(a, b); } -// CHECK-LABEL: test_vorrq_u16 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[OR_I]] uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) { return vorrq_u16(a, b); } -// CHECK-LABEL: test_vorrq_u32 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[OR_I]] uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) { return vorrq_u32(a, b); } -// CHECK-LABEL: test_vorrq_u64 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[OR_I]] uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) { return vorrq_u64(a, b); } -// CHECK-LABEL: test_vpadal_s8 -// CHECK: vpadal.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b) #4 +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]] +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) { return vpadal_s8(a, b); } -// CHECK-LABEL: test_vpadal_s16 -// CHECK: vpadal.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <2 x i32> [[TMP3]] int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) { return vpadal_s16(a, b); } -// CHECK-LABEL: test_vpadal_s32 -// CHECK: vpadal.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <1 x i64> [[TMP3]] int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) { return vpadal_s32(a, b); } -// CHECK-LABEL: test_vpadal_u8 -// CHECK: vpadal.u8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b) #4 +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]] +// CHECK: ret <4 x i16> [[TMP2]] 
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) { return vpadal_u8(a, b); } -// CHECK-LABEL: test_vpadal_u16 -// CHECK: vpadal.u16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <2 x i32> [[TMP3]] uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) { return vpadal_u16(a, b); } -// CHECK-LABEL: test_vpadal_u32 -// CHECK: vpadal.u32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <1 x i64> [[TMP3]] uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) { return vpadal_u32(a, b); } -// CHECK-LABEL: test_vpadalq_s8 -// CHECK: vpadal.s8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b) #4 +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]] +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) { return vpadalq_s8(a, b); } -// CHECK-LABEL: test_vpadalq_s16 -// CHECK: vpadal.s16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <4 x i32> [[TMP3]] int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) { return vpadalq_s16(a, b); } -// CHECK-LABEL: test_vpadalq_s32 -// CHECK: vpadal.s32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <2 x i64> [[TMP3]] int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) { return vpadalq_s32(a, b); } -// CHECK-LABEL: test_vpadalq_u8 -// CHECK: vpadal.u8 
q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b) #4 +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]] +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) { return vpadalq_u8(a, b); } -// CHECK-LABEL: test_vpadalq_u16 -// CHECK: vpadal.u16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <4 x i32> [[TMP3]] uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) { return vpadalq_u16(a, b); } -// CHECK-LABEL: test_vpadalq_u32 -// CHECK: vpadal.u32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <2 x i64> [[TMP3]] uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) { return vpadalq_u32(a, b); } -// CHECK-LABEL: test_vpadd_s8 -// CHECK: vpadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPADD_V_I]] int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) { return vpadd_s8(a, b); } -// CHECK-LABEL: test_vpadd_s16 -// CHECK: vpadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) { return vpadd_s16(a, b); } -// CHECK-LABEL: test_vpadd_s32 -// CHECK: vpadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADD_V1_I:%.*]] = 
bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) { return vpadd_s32(a, b); } -// CHECK-LABEL: test_vpadd_u8 -// CHECK: vpadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPADD_V_I]] uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) { return vpadd_u8(a, b); } -// CHECK-LABEL: test_vpadd_u16 -// CHECK: vpadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) { return vpadd_u16(a, b); } -// CHECK-LABEL: test_vpadd_u32 -// CHECK: vpadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) { return vpadd_u32(a, b); } -// CHECK-LABEL: test_vpadd_f32 -// CHECK: vpadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) { return vpadd_f32(a, b); } -// CHECK-LABEL: test_vpaddl_s8 -// CHECK: vpaddl.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a) #4 +// CHECK: ret <4 x i16> [[VPADDL_I]] int16x4_t test_vpaddl_s8(int8x8_t a) { return vpaddl_s8(a); } -// CHECK-LABEL: test_vpaddl_s16 -// CHECK: vpaddl.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4 +// CHECK: ret <2 x i32> [[VPADDL1_I]] int32x2_t test_vpaddl_s16(int16x4_t a) { return vpaddl_s16(a); } -// CHECK-LABEL: test_vpaddl_s32 -// CHECK: vpaddl.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4 +// CHECK: ret <1 x i64> [[VPADDL1_I]] int64x1_t test_vpaddl_s32(int32x2_t a) { return vpaddl_s32(a); } -// CHECK-LABEL: test_vpaddl_u8 -// CHECK: vpaddl.u8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a) #4 +// CHECK: ret <4 x i16> [[VPADDL_I]] uint16x4_t test_vpaddl_u8(uint8x8_t a) { return vpaddl_u8(a); } -// CHECK-LABEL: test_vpaddl_u16 -// CHECK: vpaddl.u16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4 +// CHECK: ret <2 x i32> [[VPADDL1_I]] uint32x2_t test_vpaddl_u16(uint16x4_t a) { return vpaddl_u16(a); } -// CHECK-LABEL: test_vpaddl_u32 -// CHECK: vpaddl.u32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4 +// CHECK: ret <1 x i64> [[VPADDL1_I]] uint64x1_t test_vpaddl_u32(uint32x2_t a) { return vpaddl_u32(a); } -// CHECK-LABEL: test_vpaddlq_s8 -// CHECK: vpaddl.s8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #4 +// CHECK: ret <8 x i16> [[VPADDL_I]] int16x8_t test_vpaddlq_s8(int8x16_t a) { return vpaddlq_s8(a); } -// CHECK-LABEL: test_vpaddlq_s16 -// CHECK: vpaddl.s16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4 +// CHECK: ret <4 x i32> [[VPADDL1_I]] int32x4_t test_vpaddlq_s16(int16x8_t a) { return vpaddlq_s16(a); } -// CHECK-LABEL: test_vpaddlq_s32 -// CHECK: vpaddl.s32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: 
[[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4 +// CHECK: ret <2 x i64> [[VPADDL1_I]] int64x2_t test_vpaddlq_s32(int32x4_t a) { return vpaddlq_s32(a); } -// CHECK-LABEL: test_vpaddlq_u8 -// CHECK: vpaddl.u8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a) #4 +// CHECK: ret <8 x i16> [[VPADDL_I]] uint16x8_t test_vpaddlq_u8(uint8x16_t a) { return vpaddlq_u8(a); } -// CHECK-LABEL: test_vpaddlq_u16 -// CHECK: vpaddl.u16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4 +// CHECK: ret <4 x i32> [[VPADDL1_I]] uint32x4_t test_vpaddlq_u16(uint16x8_t a) { return vpaddlq_u16(a); } -// CHECK-LABEL: test_vpaddlq_u32 -// CHECK: vpaddl.u32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4 +// CHECK: ret <2 x i64> [[VPADDL1_I]] uint64x2_t test_vpaddlq_u32(uint32x4_t a) { return vpaddlq_u32(a); } -// CHECK-LABEL: test_vpmax_s8 -// CHECK: vpmax.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMAX_I]] int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) { return vpmax_s8(a, b); } -// CHECK-LABEL: test_vpmax_s16 -// CHECK: vpmax.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMAX2_I]] int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) { return vpmax_s16(a, b); } -// CHECK-LABEL: test_vpmax_s32 -// CHECK: vpmax.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMAX2_I]] int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) { return vpmax_s32(a, b); } -// CHECK-LABEL: test_vpmax_u8 -// CHECK: vpmax.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = 
call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMAX_I]] uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) { return vpmax_u8(a, b); } -// CHECK-LABEL: test_vpmax_u16 -// CHECK: vpmax.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMAX2_I]] uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) { return vpmax_u16(a, b); } -// CHECK-LABEL: test_vpmax_u32 -// CHECK: vpmax.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMAX2_I]] uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) { return vpmax_u32(a, b); } -// CHECK-LABEL: test_vpmax_f32 -// CHECK: vpmax.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[VPMAX_I]], <2 x float> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x float> [[VPMAX2_I]] float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) { return vpmax_f32(a, b); } -// CHECK-LABEL: test_vpmin_s8 -// CHECK: vpmin.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMIN_I]] int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) { return vpmin_s8(a, b); } -// CHECK-LABEL: test_vpmin_s16 -// CHECK: vpmin.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMIN2_I]] int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) { return vpmin_s16(a, b); } -// CHECK-LABEL: test_vpmin_s32 -// CHECK: vpmin.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast 
<2 x i32> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMIN2_I]] int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) { return vpmin_s32(a, b); } -// CHECK-LABEL: test_vpmin_u8 -// CHECK: vpmin.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMIN_I]] uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) { return vpmin_u8(a, b); } -// CHECK-LABEL: test_vpmin_u16 -// CHECK: vpmin.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMIN2_I]] uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) { return vpmin_u16(a, b); } -// CHECK-LABEL: test_vpmin_u32 -// CHECK: vpmin.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMIN2_I]] uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) { return vpmin_u32(a, b); } -// CHECK-LABEL: test_vpmin_f32 -// CHECK: vpmin.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[VPMIN_I]], <2 x float> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x float> [[VPMIN2_I]] float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) { return vpmin_f32(a, b); } -// CHECK-LABEL: test_vqabs_s8 -// CHECK: vqabs.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 { +// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VQABS_V_I]] int8x8_t test_vqabs_s8(int8x8_t a) { return vqabs_s8(a); } -// CHECK-LABEL: test_vqabs_s16 -// CHECK: vqabs.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #4 +// 
+// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqabs_s16(int16x4_t a) {
   return vqabs_s16(a);
 }

-// CHECK-LABEL: test_vqabs_s32
-// CHECK: vqabs.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #4
+// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqabs_s32(int32x2_t a) {
   return vqabs_s32(a);
 }

-// CHECK-LABEL: test_vqabsq_s8
-// CHECK: vqabs.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a) #4
+// CHECK: ret <16 x i8> [[VQABSQ_V_I]]
 int8x16_t test_vqabsq_s8(int8x16_t a) {
   return vqabsq_s8(a);
 }

-// CHECK-LABEL: test_vqabsq_s16
-// CHECK: vqabs.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #4
+// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vqabsq_s16(int16x8_t a) {
   return vqabsq_s16(a);
 }

-// CHECK-LABEL: test_vqabsq_s32
-// CHECK: vqabs.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #4
+// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vqabsq_s32(int32x4_t a) {
   return vqabsq_s32(a);
 }

-// CHECK-LABEL: test_vqadd_s8
-// CHECK: vqadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQADD_V_I]]
 int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
   return vqadd_s8(a, b);
 }

-// CHECK-LABEL: test_vqadd_s16
-// CHECK: vqadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
   return vqadd_s16(a, b);
 }

-// CHECK-LABEL: test_vqadd_s32
-// CHECK: vqadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
   return vqadd_s32(a, b);
 }

-// CHECK-LABEL: test_vqadd_s64
-// CHECK: vqadd.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
 int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
   return vqadd_s64(a, b);
 }

-// CHECK-LABEL: test_vqadd_u8
-// CHECK: vqadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQADD_V_I]]
 uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
   return vqadd_u8(a, b);
 }

-// CHECK-LABEL: test_vqadd_u16
-// CHECK: vqadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
   return vqadd_u16(a, b);
 }

-// CHECK-LABEL: test_vqadd_u32
-// CHECK: vqadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
   return vqadd_u32(a, b);
 }

-// CHECK-LABEL: test_vqadd_u64
-// CHECK: vqadd.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
   return vqadd_u64(a, b);
 }

-// CHECK-LABEL: test_vqaddq_s8
-// CHECK: vqadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
 int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
   return vqaddq_s8(a, b);
 }

-// CHECK-LABEL: test_vqaddq_s16
-// CHECK: vqadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
 int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
   return vqaddq_s16(a, b);
 }

-// CHECK-LABEL: test_vqaddq_s32
-// CHECK: vqadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
   return vqaddq_s32(a, b);
 }

-// CHECK-LABEL: test_vqaddq_s64
-// CHECK: vqadd.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
   return vqaddq_s64(a, b);
 }

-// CHECK-LABEL: test_vqaddq_u8
-// CHECK: vqadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
 uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vqaddq_u8(a, b);
 }

-// CHECK-LABEL: test_vqaddq_u16
-// CHECK: vqadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vqaddq_u16(a, b);
 }

-// CHECK-LABEL: test_vqaddq_u32
-// CHECK: vqadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vqaddq_u32(a, b);
 }

-// CHECK-LABEL: test_vqaddq_u64
-// CHECK: vqadd.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vqaddq_u64(a, b);
 }

-// CHECK-LABEL: test_vqdmlal_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_s16(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlal_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_s32(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlal_lane_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_lane_s16(a, b, c, 3);
 }

-// CHECK-LABEL: test_vqdmlal_lane_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_lane_s32(a, b, c, 1);
 }

-// CHECK-LABEL: test_vqdmlal_n_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]]
 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlal_n_s16(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlal_n_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]]
 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlal_n_s32(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlsl_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_s16(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlsl_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_s32(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlsl_lane_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_lane_s16(a, b, c, 3);
 }

-// CHECK-LABEL: test_vqdmlsl_lane_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_lane_s32(a, b, c, 1);
 }

-// CHECK-LABEL: test_vqdmlsl_n_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlsl_n_s16(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlsl_n_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlsl_n_s32(a, b, c);
 }

-// CHECK-LABEL: test_vqdmulh_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmulh_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_s32(a, b);
 }

-// CHECK-LABEL: test_vqdmulhq_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
   return vqdmulhq_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmulhq_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
   return vqdmulhq_s32(a, b);
 }

-// CHECK-LABEL: test_vqdmulh_lane_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: test_vqdmulh_lane_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: test_vqdmulhq_lane_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   return vqdmulhq_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: test_vqdmulhq_lane_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   return vqdmulhq_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: test_vqdmulh_n_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #4
+// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqdmulh_n_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmulh_n_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #4
+// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqdmulh_n_s32(a, b);
 }

-// CHECK-LABEL: test_vqdmulhq_n_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #4
+// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqdmulhq_n_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmulhq_n_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #4
+// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqdmulhq_n_s32(a, b);
 }

-// CHECK-LABEL: test_vqdmull_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmull_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_s32(a, b);
 }

-// CHECK-LABEL: test_vqdmull_lane_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: test_vqdmull_lane_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: test_vqdmull_n_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #4
+// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
   return vqdmull_n_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmull_n_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #4
+// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
   return vqdmull_n_s32(a, b);
 }

-// CHECK-LABEL: test_vqmovn_s16
-// CHECK: vqmovn.s16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
+// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
 int8x8_t test_vqmovn_s16(int16x8_t a) {
   return vqmovn_s16(a);
 }

-// CHECK-LABEL: test_vqmovn_s32
-// CHECK: vqmovn.s32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovn_s32(int32x4_t a) {
   return vqmovn_s32(a);
 }

-// CHECK-LABEL: test_vqmovn_s64
-// CHECK: vqmovn.s64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovn_s64(int64x2_t a) {
   return vqmovn_s64(a);
 }

-// CHECK-LABEL: test_vqmovn_u16
-// CHECK: vqmovn.u16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
+// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
 uint8x8_t test_vqmovn_u16(uint16x8_t a) {
   return vqmovn_u16(a);
 }

-// CHECK-LABEL: test_vqmovn_u32
-// CHECK: vqmovn.u32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 uint16x4_t test_vqmovn_u32(uint32x4_t a) {
   return vqmovn_u32(a);
 }

-// CHECK-LABEL: test_vqmovn_u64
-// CHECK: vqmovn.u64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 uint32x2_t test_vqmovn_u64(uint64x2_t a) {
   return vqmovn_u64(a);
 }

-// CHECK-LABEL: test_vqmovun_s16
-// CHECK: vqmovun.s16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #4
+// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
 uint8x8_t test_vqmovun_s16(int16x8_t a) {
   return vqmovun_s16(a);
 }

-// CHECK-LABEL: test_vqmovun_s32
-// CHECK: vqmovun.s32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #4
+// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 uint16x4_t test_vqmovun_s32(int32x4_t a) {
   return vqmovun_s32(a);
 }

-// CHECK-LABEL: test_vqmovun_s64
-// CHECK: vqmovun.s64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #4
+// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 uint32x2_t test_vqmovun_s64(int64x2_t a) {
   return vqmovun_s64(a);
 }

-// CHECK-LABEL: test_vqneg_s8
-// CHECK: vqneg.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a) #4
+// CHECK: ret <8 x i8> [[VQNEG_V_I]]
 int8x8_t test_vqneg_s8(int8x8_t a) {
   return vqneg_s8(a);
 }

-// CHECK-LABEL: test_vqneg_s16
-// CHECK: vqneg.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #4
+// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqneg_s16(int16x4_t a) {
   return vqneg_s16(a);
 }

-// CHECK-LABEL: test_vqneg_s32
-// CHECK: vqneg.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #4
+// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqneg_s32(int32x2_t a) {
   return vqneg_s32(a);
 }

-// CHECK-LABEL: test_vqnegq_s8
-// CHECK: vqneg.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a) #4
+// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
 int8x16_t test_vqnegq_s8(int8x16_t a) {
   return vqnegq_s8(a);
 }

-// CHECK-LABEL: test_vqnegq_s16
-// CHECK: vqneg.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #4
+// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vqnegq_s16(int16x8_t a) {
   return vqnegq_s16(a);
 }

-// CHECK-LABEL: test_vqnegq_s32
-// CHECK: vqneg.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #4
+// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vqnegq_s32(int32x4_t a) {
   return vqnegq_s32(a);
 }

-// CHECK-LABEL: test_vqrdmulh_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
+// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqrdmulh_s16(a, b);
 }

-// CHECK-LABEL: test_vqrdmulh_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
+// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqrdmulh_s32(a, b);
 }

-// CHECK-LABEL: test_vqrdmulhq_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
[[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) { return vqrdmulhq_s16(a, b); } -// CHECK-LABEL: test_vqrdmulhq_s32 -// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) { return vqrdmulhq_s32(a, b); } -// CHECK-LABEL: test_vqrdmulh_lane_s16 -// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) { return vqrdmulh_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vqrdmulh_lane_s32 -// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) { return vqrdmulh_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vqrdmulhq_lane_s16 -// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] =
bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) { return vqrdmulhq_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vqrdmulhq_lane_s32 -// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) { return vqrdmulhq_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vqrdmulh_n_s16 -// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #4 +// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { return vqrdmulh_n_s16(a, b); } -// CHECK-LABEL: test_vqrdmulh_n_s32 -// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32>
@llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #4 +// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { return vqrdmulh_n_s32(a, b); } -// CHECK-LABEL: test_vqrdmulhq_n_s16 -// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #4 +// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { return vqrdmulhq_n_s16(a, b); } -// CHECK-LABEL: test_vqrdmulhq_n_s32 -// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #4 +// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { return vqrdmulhq_n_s32(a, b); } -// CHECK-LABEL: test_vqrshl_s8 -// CHECK: vqrshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQRSHL_V_I]] int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) { return 
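/* For orientation only: a rough scalar model of the per-lane behaviour the
   vqrshl family (llvm.aarch64.neon.sqrshl/uqrshl below) exercises. This is a
   sketch under assumptions, not part of the test, and `rqshl8` is a
   hypothetical helper name. The shift count is the signed value in the
   corresponding lane of b; a negative count shifts right with rounding, and
   the result saturates to the lane type (int8_t lanes shown, using <stdint.h>
   limits; out-of-range shift counts are ignored here):
     static inline int8_t rqshl8(int8_t x, int8_t n) {
       int32_t r = (n < 0) ? ((x + (1 << (-n - 1))) >> -n)  // rounding right shift
                           : ((int32_t)x << n);             // left shift
       return r > INT8_MAX ? INT8_MAX : r < INT8_MIN ? INT8_MIN : (int8_t)r;  // saturate
     }
*/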
vqrshl_s8(a, b); } -// CHECK-LABEL: test_vqrshl_s16 -// CHECK: vqrshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) { return vqrshl_s16(a, b); } -// CHECK-LABEL: test_vqrshl_s32 -// CHECK: vqrshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) { return vqrshl_s32(a, b); } -// CHECK-LABEL: test_vqrshl_s64 -// CHECK: vqrshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) { return vqrshl_s64(a, b); } -// CHECK-LABEL: test_vqrshl_u8 -// CHECK: vqrshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQRSHL_V_I]] uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) { return vqrshl_u8(a, b); } -// CHECK-LABEL: test_vqrshl_u16 -// CHECK: vqrshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x 
i16> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) { return vqrshl_u16(a, b); } -// CHECK-LABEL: test_vqrshl_u32 -// CHECK: vqrshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) { return vqrshl_u32(a, b); } -// CHECK-LABEL: test_vqrshl_u64 -// CHECK: vqrshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) { return vqrshl_u64(a, b); } -// CHECK-LABEL: test_vqrshlq_s8 -// CHECK: vqrshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) { return vqrshlq_s8(a, b); } -// CHECK-LABEL: test_vqrshlq_s16 -// CHECK: vqrshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) { return vqrshlq_s16(a, b); } -// CHECK-LABEL: test_vqrshlq_s32 -// CHECK: vqrshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// 
CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) { return vqrshlq_s32(a, b); } -// CHECK-LABEL: test_vqrshlq_s64 -// CHECK: vqrshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) { return vqrshlq_s64(a, b); } -// CHECK-LABEL: test_vqrshlq_u8 -// CHECK: vqrshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) { return vqrshlq_u8(a, b); } -// CHECK-LABEL: test_vqrshlq_u16 -// CHECK: vqrshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) { return vqrshlq_u16(a, b); } -// CHECK-LABEL: test_vqrshlq_u32 -// CHECK: vqrshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) { return vqrshlq_u32(a, b); } -// CHECK-LABEL: test_vqrshlq_u64 -// CHECK: vqrshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, 
q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) { return vqrshlq_u64(a, b); } -// CHECK-LABEL: test_vqrshrn_n_s16 -// CHECK: vqrshrn.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQRSHRN_N]]1 int8x8_t test_vqrshrn_n_s16(int16x8_t a) { return vqrshrn_n_s16(a, 1); } -// CHECK-LABEL: test_vqrshrn_n_s32 -// CHECK: vqrshrn.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQRSHRN_N]]1 int16x4_t test_vqrshrn_n_s32(int32x4_t a) { return vqrshrn_n_s32(a, 1); } -// CHECK-LABEL: test_vqrshrn_n_s64 -// CHECK: vqrshrn.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQRSHRN_N]]1 int32x2_t test_vqrshrn_n_s64(int64x2_t a) { return vqrshrn_n_s64(a, 1); } -// CHECK-LABEL: test_vqrshrn_n_u16 -// CHECK: vqrshrn.u16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQRSHRN_N]]1 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) { return vqrshrn_n_u16(a, 1); } -// CHECK-LABEL: test_vqrshrn_n_u32 -// CHECK: vqrshrn.u32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQRSHRN_N]]1 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) { return vqrshrn_n_u32(a, 1); } -// CHECK-LABEL: test_vqrshrn_n_u64 -// CHECK: vqrshrn.u64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> 
@test_vqrshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQRSHRN_N]]1 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) { return vqrshrn_n_u64(a, 1); } -// CHECK-LABEL: test_vqrshrun_n_s16 -// CHECK: vqrshrun.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQRSHRUN_N]]1 uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { return vqrshrun_n_s16(a, 1); } -// CHECK-LABEL: test_vqrshrun_n_s32 -// CHECK: vqrshrun.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQRSHRUN_N]]1 uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { return vqrshrun_n_s32(a, 1); } -// CHECK-LABEL: test_vqrshrun_n_s64 -// CHECK: vqrshrun.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQRSHRUN_N]]1 uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { return vqrshrun_n_s64(a, 1); } -// CHECK-LABEL: test_vqshl_s8 -// CHECK: vqshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQSHL_V_I]] int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) { return vqshl_s8(a, b); } -// CHECK-LABEL: test_vqshl_s16 -// CHECK: vqshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) { return vqshl_s16(a, b); } -// CHECK-LABEL: test_vqshl_s32 -// CHECK: vqshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x 
i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) { return vqshl_s32(a, b); } -// CHECK-LABEL: test_vqshl_s64 -// CHECK: vqshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) { return vqshl_s64(a, b); } -// CHECK-LABEL: test_vqshl_u8 -// CHECK: vqshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQSHL_V_I]] uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) { return vqshl_u8(a, b); } -// CHECK-LABEL: test_vqshl_u16 -// CHECK: vqshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) { return vqshl_u16(a, b); } -// CHECK-LABEL: test_vqshl_u32 -// CHECK: vqshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) { return vqshl_u32(a, b); } -// CHECK-LABEL: test_vqshl_u64 -// CHECK: vqshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) { return vqshl_u64(a, b); } -// CHECK-LABEL: test_vqshlq_s8 -// CHECK: vqshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQSHLQ_V_I]] int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) { return vqshlq_s8(a, b); } -// CHECK-LABEL: test_vqshlq_s16 -// CHECK: vqshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) { return vqshlq_s16(a, b); } -// CHECK-LABEL: test_vqshlq_s32 -// CHECK: vqshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) { return vqshlq_s32(a, b); } -// CHECK-LABEL: test_vqshlq_s64 -// CHECK: vqshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) { 
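/* Note for the immediate-shift tests that follow: the _n forms (vqshl_n_*,
   vqshlu_n_*) are expected to lower to the same llvm.aarch64.neon.sqshl,
   uqshl, and sqshlu intrinsics as the by-register forms above, with the
   immediate splatted into a constant vector second operand. A sketch of the
   IR for vqshl_n_s8(a, 1), matching the CHECK lines below:
     %r = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
*/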
return vqshlq_s64(a, b); } -// CHECK-LABEL: test_vqshlq_u8 -// CHECK: vqshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQSHLQ_V_I]] uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) { return vqshlq_u8(a, b); } -// CHECK-LABEL: test_vqshlq_u16 -// CHECK: vqshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) { return vqshlq_u16(a, b); } -// CHECK-LABEL: test_vqshlq_u32 -// CHECK: vqshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) { return vqshlq_u32(a, b); } -// CHECK-LABEL: test_vqshlq_u64 -// CHECK: vqshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) { return vqshlq_u64(a, b); } -// CHECK-LABEL: test_vqshlu_n_s8 -// CHECK: vqshlu.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <8 x i8> [[VQSHLU_N]] uint8x8_t test_vqshlu_n_s8(int8x8_t a) { return vqshlu_n_s8(a, 1); } -// CHECK-LABEL: test_vqshlu_n_s16 -// CHECK: vqshlu.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8
x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <4 x i16> [[VQSHLU_N]]1 uint16x4_t test_vqshlu_n_s16(int16x4_t a) { return vqshlu_n_s16(a, 1); } -// CHECK-LABEL: test_vqshlu_n_s32 -// CHECK: vqshlu.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>) +// CHECK: ret <2 x i32> [[VQSHLU_N]]1 uint32x2_t test_vqshlu_n_s32(int32x2_t a) { return vqshlu_n_s32(a, 1); } -// CHECK-LABEL: test_vqshlu_n_s64 -// CHECK: vqshlu.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>) +// CHECK: ret <1 x i64> [[VQSHLU_N]]1 uint64x1_t test_vqshlu_n_s64(int64x1_t a) { return vqshlu_n_s64(a, 1); } -// CHECK-LABEL: test_vqshluq_n_s8 -// CHECK: vqshlu.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <16 x i8> [[VQSHLU_N]] uint8x16_t test_vqshluq_n_s8(int8x16_t a) { return vqshluq_n_s8(a, 1); } -// CHECK-LABEL: test_vqshluq_n_s16 -// CHECK: vqshlu.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <8 x i16> [[VQSHLU_N]]1 uint16x8_t test_vqshluq_n_s16(int16x8_t a) { return vqshluq_n_s16(a, 1); } -// CHECK-LABEL: test_vqshluq_n_s32 -// CHECK: vqshlu.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>) +// CHECK: ret <4 x i32> [[VQSHLU_N]]1 uint32x4_t test_vqshluq_n_s32(int32x4_t a) { return vqshluq_n_s32(a, 1); } -// CHECK-LABEL: test_vqshluq_n_s64 -// CHECK: vqshlu.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>) +// CHECK: ret <2 x i64> [[VQSHLU_N]]1 uint64x2_t test_vqshluq_n_s64(int64x2_t a) { return vqshluq_n_s64(a, 1); } -// CHECK-LABEL: test_vqshl_n_s8 -// CHECK: vqshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x
i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <8 x i8> [[VQSHL_N]] int8x8_t test_vqshl_n_s8(int8x8_t a) { return vqshl_n_s8(a, 1); } -// CHECK-LABEL: test_vqshl_n_s16 -// CHECK: vqshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <4 x i16> [[VQSHL_N]]1 int16x4_t test_vqshl_n_s16(int16x4_t a) { return vqshl_n_s16(a, 1); } -// CHECK-LABEL: test_vqshl_n_s32 -// CHECK: vqshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>) +// CHECK: ret <2 x i32> [[VQSHL_N]]1 int32x2_t test_vqshl_n_s32(int32x2_t a) { return vqshl_n_s32(a, 1); } -// CHECK-LABEL: test_vqshl_n_s64 -// CHECK: vqshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>) +// CHECK: ret <1 x i64> [[VQSHL_N]]1 int64x1_t test_vqshl_n_s64(int64x1_t a) { return vqshl_n_s64(a, 1); } -// CHECK-LABEL: test_vqshl_n_u8 -// CHECK: vqshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <8 x i8> [[VQSHL_N]] uint8x8_t test_vqshl_n_u8(uint8x8_t a) { return vqshl_n_u8(a, 1); } -// CHECK-LABEL: test_vqshl_n_u16 -// CHECK: vqshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <4 x i16> [[VQSHL_N]]1 uint16x4_t test_vqshl_n_u16(uint16x4_t a) { return vqshl_n_u16(a, 1); } -// CHECK-LABEL: test_vqshl_n_u32 -// CHECK: vqshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>) +// CHECK: ret <2 x i32> [[VQSHL_N]]1 uint32x2_t test_vqshl_n_u32(uint32x2_t a) { return vqshl_n_u32(a, 1); } -// CHECK-LABEL: test_vqshl_n_u64 -// CHECK: vqshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>) +// CHECK: ret <1 x i64> [[VQSHL_N]]1 uint64x1_t test_vqshl_n_u64(uint64x1_t a) { return vqshl_n_u64(a, 1); } -// CHECK-LABEL:
test_vqshlq_n_s8 -// CHECK: vqshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <16 x i8> [[VQSHL_N]] int8x16_t test_vqshlq_n_s8(int8x16_t a) { return vqshlq_n_s8(a, 1); } -// CHECK-LABEL: test_vqshlq_n_s16 -// CHECK: vqshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <8 x i16> [[VQSHL_N]]1 int16x8_t test_vqshlq_n_s16(int16x8_t a) { return vqshlq_n_s16(a, 1); } -// CHECK-LABEL: test_vqshlq_n_s32 -// CHECK: vqshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>) +// CHECK: ret <4 x i32> [[VQSHL_N]]1 int32x4_t test_vqshlq_n_s32(int32x4_t a) { return vqshlq_n_s32(a, 1); } -// CHECK-LABEL: test_vqshlq_n_s64 -// CHECK: vqshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>) +// CHECK: ret <2 x i64> [[VQSHL_N]]1 int64x2_t test_vqshlq_n_s64(int64x2_t a) { return vqshlq_n_s64(a, 1); } -// CHECK-LABEL: test_vqshlq_n_u8 -// CHECK: vqshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <16 x i8> [[VQSHL_N]] uint8x16_t test_vqshlq_n_u8(uint8x16_t a) { return vqshlq_n_u8(a, 1); } -// CHECK-LABEL: test_vqshlq_n_u16 -// CHECK: vqshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <8 x i16> [[VQSHL_N]]1 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) { return vqshlq_n_u16(a, 1); } -// CHECK-LABEL: test_vqshlq_n_u32 -// CHECK: vqshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>) +// CHECK: ret <4 x i32> [[VQSHL_N]]1 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) { return vqshlq_n_u32(a, 1); } -// CHECK-LABEL: test_vqshlq_n_u64 -// CHECK: vqshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK:
[[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>) +// CHECK: ret <2 x i64> [[VQSHL_N]]1 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) { return vqshlq_n_u64(a, 1); } -// CHECK-LABEL: test_vqshrn_n_s16 -// CHECK: vqshrn.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQSHRN_N]]1 int8x8_t test_vqshrn_n_s16(int16x8_t a) { return vqshrn_n_s16(a, 1); } -// CHECK-LABEL: test_vqshrn_n_s32 -// CHECK: vqshrn.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQSHRN_N]]1 int16x4_t test_vqshrn_n_s32(int32x4_t a) { return vqshrn_n_s32(a, 1); } -// CHECK-LABEL: test_vqshrn_n_s64 -// CHECK: vqshrn.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQSHRN_N]]1 int32x2_t test_vqshrn_n_s64(int64x2_t a) { return vqshrn_n_s64(a, 1); } -// CHECK-LABEL: test_vqshrn_n_u16 -// CHECK: vqshrn.u16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQSHRN_N]]1 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) { return vqshrn_n_u16(a, 1); } -// CHECK-LABEL: test_vqshrn_n_u32 -// CHECK: vqshrn.u32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQSHRN_N]]1 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) { return vqshrn_n_u32(a, 1); } -// CHECK-LABEL: test_vqshrn_n_u64 -// CHECK: vqshrn.u64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQSHRN_N]]1 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) { return vqshrn_n_u64(a, 1); } -// CHECK-LABEL: test_vqshrun_n_s16 -// CHECK: vqshrun.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 { +//
CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQSHRUN_N]]1 uint8x8_t test_vqshrun_n_s16(int16x8_t a) { return vqshrun_n_s16(a, 1); } -// CHECK-LABEL: test_vqshrun_n_s32 -// CHECK: vqshrun.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQSHRUN_N]]1 uint16x4_t test_vqshrun_n_s32(int32x4_t a) { return vqshrun_n_s32(a, 1); } -// CHECK-LABEL: test_vqshrun_n_s64 -// CHECK: vqshrun.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQSHRUN_N]]1 uint32x2_t test_vqshrun_n_s64(int64x2_t a) { return vqshrun_n_s64(a, 1); } -// CHECK-LABEL: test_vqsub_s8 -// CHECK: vqsub.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQSUB_V_I]] int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) { return vqsub_s8(a, b); } -// CHECK-LABEL: test_vqsub_s16 -// CHECK: vqsub.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) { return vqsub_s16(a, b); } -// CHECK-LABEL: test_vqsub_s32 -// CHECK: vqsub.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) { return vqsub_s32(a, b); } -// CHECK-LABEL: test_vqsub_s64 -// CHECK: vqsub.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: 
define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) { return vqsub_s64(a, b); } -// CHECK-LABEL: test_vqsub_u8 -// CHECK: vqsub.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQSUB_V_I]] uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) { return vqsub_u8(a, b); } -// CHECK-LABEL: test_vqsub_u16 -// CHECK: vqsub.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) { return vqsub_u16(a, b); } -// CHECK-LABEL: test_vqsub_u32 -// CHECK: vqsub.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) { return vqsub_u32(a, b); } -// CHECK-LABEL: test_vqsub_u64 -// CHECK: vqsub.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vqsub_u64(uint64x1_t a, 
uint64x1_t b) { return vqsub_u64(a, b); } -// CHECK-LABEL: test_vqsubq_s8 -// CHECK: vqsub.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQSUBQ_V_I]] int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) { return vqsubq_s8(a, b); } -// CHECK-LABEL: test_vqsubq_s16 -// CHECK: vqsub.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) { return vqsubq_s16(a, b); } -// CHECK-LABEL: test_vqsubq_s32 -// CHECK: vqsub.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) { return vqsubq_s32(a, b); } -// CHECK-LABEL: test_vqsubq_s64 -// CHECK: vqsub.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) { return vqsubq_s64(a, b); } -// CHECK-LABEL: test_vqsubq_u8 -// CHECK: vqsub.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQSUBQ_V_I]] uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) { return vqsubq_u8(a, b); } -// CHECK-LABEL: test_vqsubq_u16 -// CHECK: vqsub.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 { 
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) { return vqsubq_u16(a, b); } -// CHECK-LABEL: test_vqsubq_u32 -// CHECK: vqsub.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) { return vqsubq_u32(a, b); } -// CHECK-LABEL: test_vqsubq_u64 -// CHECK: vqsub.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) { return vqsubq_u64(a, b); } -// CHECK-LABEL: test_vraddhn_s16 -// CHECK: vraddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) { return vraddhn_s16(a, b); } -// CHECK-LABEL: test_vraddhn_s32 -// CHECK: vraddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRADDHN_V2_I:%.*]] = 
call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) { return vraddhn_s32(a, b); } -// CHECK-LABEL: test_vraddhn_s64 -// CHECK: vraddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) { return vraddhn_s64(a, b); } -// CHECK-LABEL: test_vraddhn_u16 -// CHECK: vraddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) { return vraddhn_u16(a, b); } -// CHECK-LABEL: test_vraddhn_u32 -// CHECK: vraddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) { return vraddhn_u32(a, b); } -// CHECK-LABEL: test_vraddhn_u64 -// CHECK: vraddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> +// CHECK: 
ret <2 x i32> [[TMP2]] uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) { return vraddhn_u64(a, b); } -// CHECK-LABEL: test_vrecpe_f32 -// CHECK: vrecpe.f32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #4 +// CHECK: ret <2 x float> [[VRECPE_V1_I]] float32x2_t test_vrecpe_f32(float32x2_t a) { return vrecpe_f32(a); } -// CHECK-LABEL: test_vrecpe_u32 -// CHECK: vrecpe.u32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #4 +// CHECK: ret <2 x i32> [[VRECPE_V1_I]] uint32x2_t test_vrecpe_u32(uint32x2_t a) { return vrecpe_u32(a); } -// CHECK-LABEL: test_vrecpeq_f32 -// CHECK: vrecpe.f32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #4 +// CHECK: ret <4 x float> [[VRECPEQ_V1_I]] float32x4_t test_vrecpeq_f32(float32x4_t a) { return vrecpeq_f32(a); } -// CHECK-LABEL: test_vrecpeq_u32 -// CHECK: vrecpe.u32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #4 +// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]] uint32x4_t test_vrecpeq_u32(uint32x4_t a) { return vrecpeq_u32(a); } -// CHECK-LABEL: test_vrecps_f32 -// CHECK: vrecps.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4 +// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) { return vrecps_f32(a, b); } -// CHECK-LABEL: test_vrecpsq_f32 -// CHECK: vrecps.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.frecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4 +// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) { return vrecpsq_f32(a, b); } -// CHECK-LABEL: test_vreinterpret_s8_s16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s16(int16x4_t a) { return vreinterpret_s8_s16(a); } -// CHECK-LABEL: test_vreinterpret_s8_s32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s32(int32x2_t a) { return vreinterpret_s8_s32(a); } -// CHECK-LABEL: test_vreinterpret_s8_s64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s64(int64x1_t a) { return vreinterpret_s8_s64(a); } -// CHECK-LABEL: test_vreinterpret_s8_u8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) { return vreinterpret_s8_u8(a); } -// CHECK-LABEL: test_vreinterpret_s8_u16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) { return vreinterpret_s8_u16(a); } -// CHECK-LABEL: test_vreinterpret_s8_u32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) { return vreinterpret_s8_u32(a); } -// CHECK-LABEL: test_vreinterpret_s8_u64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) { return vreinterpret_s8_u64(a); } -// CHECK-LABEL: test_vreinterpret_s8_f16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f16(float16x4_t a) { return vreinterpret_s8_f16(a); } -// CHECK-LABEL: test_vreinterpret_s8_f32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f32(float32x2_t a) { return vreinterpret_s8_f32(a); } -// CHECK-LABEL: test_vreinterpret_s8_p8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) { return vreinterpret_s8_p8(a); } -// CHECK-LABEL: test_vreinterpret_s8_p16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) { return vreinterpret_s8_p16(a); } -// CHECK-LABEL: test_vreinterpret_s16_s8 +// CHECK-LABEL: define 
<4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s8(int8x8_t a) { return vreinterpret_s16_s8(a); } -// CHECK-LABEL: test_vreinterpret_s16_s32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s32(int32x2_t a) { return vreinterpret_s16_s32(a); } -// CHECK-LABEL: test_vreinterpret_s16_s64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s64(int64x1_t a) { return vreinterpret_s16_s64(a); } -// CHECK-LABEL: test_vreinterpret_s16_u8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) { return vreinterpret_s16_u8(a); } -// CHECK-LABEL: test_vreinterpret_s16_u16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) { return vreinterpret_s16_u16(a); } -// CHECK-LABEL: test_vreinterpret_s16_u32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) { return vreinterpret_s16_u32(a); } -// CHECK-LABEL: test_vreinterpret_s16_u64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) { return vreinterpret_s16_u64(a); } -// CHECK-LABEL: test_vreinterpret_s16_f16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f16(float16x4_t a) { return vreinterpret_s16_f16(a); } -// CHECK-LABEL: test_vreinterpret_s16_f32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f32(float32x2_t a) { return vreinterpret_s16_f32(a); } -// CHECK-LABEL: test_vreinterpret_s16_p8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) { return vreinterpret_s16_p8(a); } -// CHECK-LABEL: test_vreinterpret_s16_p16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) { return vreinterpret_s16_p16(a); } -// CHECK-LABEL: test_vreinterpret_s32_s8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s8(int8x8_t a) { return vreinterpret_s32_s8(a); } -// CHECK-LABEL: test_vreinterpret_s32_s16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s16(int16x4_t a) { return vreinterpret_s32_s16(a); } -// CHECK-LABEL: test_vreinterpret_s32_s64 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s64(int64x1_t a) { return vreinterpret_s32_s64(a); } -// CHECK-LABEL: test_vreinterpret_s32_u8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) { return vreinterpret_s32_u8(a); } -// CHECK-LABEL: test_vreinterpret_s32_u16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) { return vreinterpret_s32_u16(a); } -// CHECK-LABEL: test_vreinterpret_s32_u32 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 { +// CHECK: ret <2 x i32> %a int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) { return vreinterpret_s32_u32(a); } -// CHECK-LABEL: test_vreinterpret_s32_u64 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) { return vreinterpret_s32_u64(a); } -// CHECK-LABEL: test_vreinterpret_s32_f16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f16(float16x4_t a) { return vreinterpret_s32_f16(a); } -// CHECK-LABEL: test_vreinterpret_s32_f32 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f32(float32x2_t a) { return vreinterpret_s32_f32(a); } -// CHECK-LABEL: test_vreinterpret_s32_p8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) { return vreinterpret_s32_p8(a); } -// CHECK-LABEL: test_vreinterpret_s32_p16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) { return vreinterpret_s32_p16(a); } -// CHECK-LABEL: test_vreinterpret_s64_s8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s8(int8x8_t a) { return vreinterpret_s64_s8(a); } -// CHECK-LABEL: test_vreinterpret_s64_s16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s16(int16x4_t a) { return vreinterpret_s64_s16(a); } -// CHECK-LABEL: test_vreinterpret_s64_s32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s32(int32x2_t a) { return vreinterpret_s64_s32(a); } -// CHECK-LABEL: test_vreinterpret_s64_u8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) { return vreinterpret_s64_u8(a); } -// CHECK-LABEL: test_vreinterpret_s64_u16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) { return vreinterpret_s64_u16(a); } -// CHECK-LABEL: test_vreinterpret_s64_u32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) { return vreinterpret_s64_u32(a); } -// CHECK-LABEL: test_vreinterpret_s64_u64 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) { return vreinterpret_s64_u64(a); } -// CHECK-LABEL: test_vreinterpret_s64_f16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f16(float16x4_t a) { return vreinterpret_s64_f16(a); } -// CHECK-LABEL: test_vreinterpret_s64_f32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f32(float32x2_t a) { return vreinterpret_s64_f32(a); } -// CHECK-LABEL: test_vreinterpret_s64_p8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) { return vreinterpret_s64_p8(a); } -// CHECK-LABEL: test_vreinterpret_s64_p16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) { return vreinterpret_s64_p16(a); } -// CHECK-LABEL: test_vreinterpret_u8_s8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) { return vreinterpret_u8_s8(a); } -// CHECK-LABEL: test_vreinterpret_u8_s16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) { return vreinterpret_u8_s16(a); } -// CHECK-LABEL: test_vreinterpret_u8_s32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) { return vreinterpret_u8_s32(a); } -// CHECK-LABEL: test_vreinterpret_u8_s64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] 
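// NOTE (editorial, illustrative only): the vraddhn_* tests earlier in this
// hunk exercise "rounding add and narrow, high half": the two wide lanes are
// summed, a rounding constant of 1 << (shift - 1) is added, and the top half
// of the (modulo) result is kept. Both the signed and unsigned variants lower
// to the same @llvm.aarch64.neon.raddhn.* intrinsic because the operation is
// sign-agnostic on two's-complement bit patterns. A scalar sketch of the
// 16-to-8-bit case (hypothetical helper, assuming <stdint.h>):
static inline uint8_t raddhn_u16_model(uint16_t a, uint16_t b) {
  uint32_t sum = (uint32_t)a + (uint32_t)b + (1u << 7); // add + rounding const
  return (uint8_t)((sum & 0xFFFFu) >> 8);               // high half, mod 2^16
}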
uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) { return vreinterpret_u8_s64(a); } -// CHECK-LABEL: test_vreinterpret_u8_u16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) { return vreinterpret_u8_u16(a); } -// CHECK-LABEL: test_vreinterpret_u8_u32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) { return vreinterpret_u8_u32(a); } -// CHECK-LABEL: test_vreinterpret_u8_u64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) { return vreinterpret_u8_u64(a); } -// CHECK-LABEL: test_vreinterpret_u8_f16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) { return vreinterpret_u8_f16(a); } -// CHECK-LABEL: test_vreinterpret_u8_f32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) { return vreinterpret_u8_f32(a); } -// CHECK-LABEL: test_vreinterpret_u8_p8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) { return vreinterpret_u8_p8(a); } -// CHECK-LABEL: test_vreinterpret_u8_p16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) { return vreinterpret_u8_p16(a); } -// CHECK-LABEL: test_vreinterpret_u16_s8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) { return vreinterpret_u16_s8(a); } -// CHECK-LABEL: test_vreinterpret_u16_s16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) { return vreinterpret_u16_s16(a); } -// CHECK-LABEL: test_vreinterpret_u16_s32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) { return vreinterpret_u16_s32(a); } -// CHECK-LABEL: test_vreinterpret_u16_s64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) { return vreinterpret_u16_s64(a); } -// CHECK-LABEL: test_vreinterpret_u16_u8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) { return vreinterpret_u16_u8(a); } -// CHECK-LABEL: 
test_vreinterpret_u16_u32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) { return vreinterpret_u16_u32(a); } -// CHECK-LABEL: test_vreinterpret_u16_u64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) { return vreinterpret_u16_u64(a); } -// CHECK-LABEL: test_vreinterpret_u16_f16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) { return vreinterpret_u16_f16(a); } -// CHECK-LABEL: test_vreinterpret_u16_f32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) { return vreinterpret_u16_f32(a); } -// CHECK-LABEL: test_vreinterpret_u16_p8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) { return vreinterpret_u16_p8(a); } -// CHECK-LABEL: test_vreinterpret_u16_p16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) { return vreinterpret_u16_p16(a); } -// CHECK-LABEL: test_vreinterpret_u32_s8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) { return vreinterpret_u32_s8(a); } -// CHECK-LABEL: test_vreinterpret_u32_s16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) { return vreinterpret_u32_s16(a); } -// CHECK-LABEL: test_vreinterpret_u32_s32 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 { +// CHECK: ret <2 x i32> %a uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) { return vreinterpret_u32_s32(a); } -// CHECK-LABEL: test_vreinterpret_u32_s64 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) { return vreinterpret_u32_s64(a); } -// CHECK-LABEL: test_vreinterpret_u32_u8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) { return vreinterpret_u32_u8(a); } -// CHECK-LABEL: test_vreinterpret_u32_u16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) { return vreinterpret_u32_u16(a); } -// CHECK-LABEL: test_vreinterpret_u32_u64 +// CHECK-LABEL: define <2 x 
i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) { return vreinterpret_u32_u64(a); } -// CHECK-LABEL: test_vreinterpret_u32_f16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) { return vreinterpret_u32_f16(a); } -// CHECK-LABEL: test_vreinterpret_u32_f32 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) { return vreinterpret_u32_f32(a); } -// CHECK-LABEL: test_vreinterpret_u32_p8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) { return vreinterpret_u32_p8(a); } -// CHECK-LABEL: test_vreinterpret_u32_p16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) { return vreinterpret_u32_p16(a); } -// CHECK-LABEL: test_vreinterpret_u64_s8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) { return vreinterpret_u64_s8(a); } -// CHECK-LABEL: test_vreinterpret_u64_s16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) { return vreinterpret_u64_s16(a); } -// CHECK-LABEL: test_vreinterpret_u64_s32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) { return vreinterpret_u64_s32(a); } -// CHECK-LABEL: test_vreinterpret_u64_s64 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) { return vreinterpret_u64_s64(a); } -// CHECK-LABEL: test_vreinterpret_u64_u8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) { return vreinterpret_u64_u8(a); } -// CHECK-LABEL: test_vreinterpret_u64_u16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) { return vreinterpret_u64_u16(a); } -// CHECK-LABEL: test_vreinterpret_u64_u32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) { return vreinterpret_u64_u32(a); } -// CHECK-LABEL: test_vreinterpret_u64_f16 +// CHECK-LABEL: 
define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) { return vreinterpret_u64_f16(a); } -// CHECK-LABEL: test_vreinterpret_u64_f32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) { return vreinterpret_u64_f32(a); } -// CHECK-LABEL: test_vreinterpret_u64_p8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) { return vreinterpret_u64_p8(a); } -// CHECK-LABEL: test_vreinterpret_u64_p16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) { return vreinterpret_u64_p16(a); } -// CHECK-LABEL: test_vreinterpret_f16_s8 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s8(int8x8_t a) { return vreinterpret_f16_s8(a); } -// CHECK-LABEL: test_vreinterpret_f16_s16 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s16(int16x4_t a) { return vreinterpret_f16_s16(a); } -// CHECK-LABEL: test_vreinterpret_f16_s32 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s32(int32x2_t a) { return vreinterpret_f16_s32(a); } -// CHECK-LABEL: test_vreinterpret_f16_s64 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s64(int64x1_t a) { return vreinterpret_f16_s64(a); } -// CHECK-LABEL: test_vreinterpret_f16_u8 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) { return vreinterpret_f16_u8(a); } -// CHECK-LABEL: test_vreinterpret_f16_u16 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) { return vreinterpret_f16_u16(a); } -// CHECK-LABEL: test_vreinterpret_f16_u32 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) { return vreinterpret_f16_u32(a); } -// CHECK-LABEL: test_vreinterpret_f16_u64 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t 
test_vreinterpret_f16_u64(uint64x1_t a) { return vreinterpret_f16_u64(a); } -// CHECK-LABEL: test_vreinterpret_f16_f32 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_f32(float32x2_t a) { return vreinterpret_f16_f32(a); } -// CHECK-LABEL: test_vreinterpret_f16_p8 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) { return vreinterpret_f16_p8(a); } -// CHECK-LABEL: test_vreinterpret_f16_p16 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) { return vreinterpret_f16_p16(a); } -// CHECK-LABEL: test_vreinterpret_f32_s8 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s8(int8x8_t a) { return vreinterpret_f32_s8(a); } -// CHECK-LABEL: test_vreinterpret_f32_s16 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s16(int16x4_t a) { return vreinterpret_f32_s16(a); } -// CHECK-LABEL: test_vreinterpret_f32_s32 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s32(int32x2_t a) { return vreinterpret_f32_s32(a); } -// CHECK-LABEL: test_vreinterpret_f32_s64 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s64(int64x1_t a) { return vreinterpret_f32_s64(a); } -// CHECK-LABEL: test_vreinterpret_f32_u8 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) { return vreinterpret_f32_u8(a); } -// CHECK-LABEL: test_vreinterpret_f32_u16 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) { return vreinterpret_f32_u16(a); } -// CHECK-LABEL: test_vreinterpret_f32_u32 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) { return vreinterpret_f32_u32(a); } -// CHECK-LABEL: test_vreinterpret_f32_u64 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) { return vreinterpret_f32_u64(a); } -// CHECK-LABEL: test_vreinterpret_f32_f16 +// CHECK-LABEL: define <2 x float> 
@test_vreinterpret_f32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_f16(float16x4_t a) { return vreinterpret_f32_f16(a); } -// CHECK-LABEL: test_vreinterpret_f32_p8 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) { return vreinterpret_f32_p8(a); } -// CHECK-LABEL: test_vreinterpret_f32_p16 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) { return vreinterpret_f32_p16(a); } -// CHECK-LABEL: test_vreinterpret_p8_s8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) { return vreinterpret_p8_s8(a); } -// CHECK-LABEL: test_vreinterpret_p8_s16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) { return vreinterpret_p8_s16(a); } -// CHECK-LABEL: test_vreinterpret_p8_s32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) { return vreinterpret_p8_s32(a); } -// CHECK-LABEL: test_vreinterpret_p8_s64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) { return vreinterpret_p8_s64(a); } -// CHECK-LABEL: test_vreinterpret_p8_u8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) { return vreinterpret_p8_u8(a); } -// CHECK-LABEL: test_vreinterpret_p8_u16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) { return vreinterpret_p8_u16(a); } -// CHECK-LABEL: test_vreinterpret_p8_u32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) { return vreinterpret_p8_u32(a); } -// CHECK-LABEL: test_vreinterpret_p8_u64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) { return vreinterpret_p8_u64(a); } -// CHECK-LABEL: test_vreinterpret_p8_f16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) { return vreinterpret_p8_f16(a); } -// CHECK-LABEL: test_vreinterpret_p8_f32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 
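// NOTE (editorial, illustrative only): the frecpe/frecps pairs checked
// earlier in this hunk are designed to compose: vrecpe_f32 returns a
// low-precision estimate of 1/d, and vrecps_f32(d, x) computes 2.0 - d*x,
// the Newton-Raphson correction factor. A common usage sketch (not part of
// this test; the function name is invented for exposition):
static inline float32x2_t recip_newton_model(float32x2_t d) {
  float32x2_t x = vrecpe_f32(d);          // coarse estimate of 1/d
  x = vmul_f32(x, vrecps_f32(d, x));      // first refinement step
  x = vmul_f32(x, vrecps_f32(d, x));      // second step: near full precision
  return x;
}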
+// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) { return vreinterpret_p8_f32(a); } -// CHECK-LABEL: test_vreinterpret_p8_p16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) { return vreinterpret_p8_p16(a); } -// CHECK-LABEL: test_vreinterpret_p16_s8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) { return vreinterpret_p16_s8(a); } -// CHECK-LABEL: test_vreinterpret_p16_s16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) { return vreinterpret_p16_s16(a); } -// CHECK-LABEL: test_vreinterpret_p16_s32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) { return vreinterpret_p16_s32(a); } -// CHECK-LABEL: test_vreinterpret_p16_s64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) { return vreinterpret_p16_s64(a); } -// CHECK-LABEL: test_vreinterpret_p16_u8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) { return vreinterpret_p16_u8(a); } -// CHECK-LABEL: test_vreinterpret_p16_u16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) { return vreinterpret_p16_u16(a); } -// CHECK-LABEL: test_vreinterpret_p16_u32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) { return vreinterpret_p16_u32(a); } -// CHECK-LABEL: test_vreinterpret_p16_u64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) { return vreinterpret_p16_u64(a); } -// CHECK-LABEL: test_vreinterpret_p16_f16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) { return vreinterpret_p16_f16(a); } -// CHECK-LABEL: test_vreinterpret_p16_f32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) { return vreinterpret_p16_f32(a); } -// CHECK-LABEL: test_vreinterpret_p16_p8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t 
test_vreinterpret_p16_p8(poly8x8_t a) { return vreinterpret_p16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) { return vreinterpretq_s8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) { return vreinterpretq_s8_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) { return vreinterpretq_s8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) { return vreinterpretq_s8_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) { return vreinterpretq_s8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) { return vreinterpretq_s8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) { return vreinterpretq_s8_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_f16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) { return vreinterpretq_s8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_f32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) { return vreinterpretq_s8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) { return vreinterpretq_s8_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) { return vreinterpretq_s8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) { return 
vreinterpretq_s16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) { return vreinterpretq_s16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) { return vreinterpretq_s16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) { return vreinterpretq_s16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) { return vreinterpretq_s16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) { return vreinterpretq_s16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) { return vreinterpretq_s16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_f16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) { return vreinterpretq_s16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_f32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) { return vreinterpretq_s16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) { return vreinterpretq_s16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) { return vreinterpretq_s16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) { return vreinterpretq_s32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) { return 
vreinterpretq_s32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s64 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) { return vreinterpretq_s32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) { return vreinterpretq_s32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) { return vreinterpretq_s32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u32 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 { +// CHECK: ret <4 x i32> %a int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) { return vreinterpretq_s32_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u64 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) { return vreinterpretq_s32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_f16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) { return vreinterpretq_s32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_f32 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) { return vreinterpretq_s32_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) { return vreinterpretq_s32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) { return vreinterpretq_s32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) { return vreinterpretq_s64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) { return vreinterpretq_s64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] 
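// NOTE (editorial, illustrative only): every vreinterpret/vreinterpretq test
// in this stretch is a pure bit reinterpretation -- same 64 or 128 bits, new
// lane type -- which is why the new checks expect a single IR `bitcast`, or
// no instruction at all when source and destination lane types already match
// (e.g. test_vreinterpretq_s8_u8 simply returns %a). A C sketch of the same
// idea via memcpy-based type punning (hypothetical helper, assuming
// <string.h>):
static inline int32x2_t reinterpret_s32_s8_model(int8x8_t a) {
  int32x2_t r;
  memcpy(&r, &a, sizeof r);   // copy the 8 bytes verbatim; no per-lane convert
  return r;
}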
int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) { return vreinterpretq_s64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) { return vreinterpretq_s64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) { return vreinterpretq_s64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) { return vreinterpretq_s64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u64 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) { return vreinterpretq_s64_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s64_f16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) { return vreinterpretq_s64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_f32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) { return vreinterpretq_s64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) { return vreinterpretq_s64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) { return vreinterpretq_s64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) { return vreinterpretq_u8_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) { return vreinterpretq_u8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) { return vreinterpretq_u8_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t 
test_vreinterpretq_u8_s64(int64x2_t a) { return vreinterpretq_u8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) { return vreinterpretq_u8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) { return vreinterpretq_u8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) { return vreinterpretq_u8_u64(a); } -// CHECK-LABEL: test_vreinterpretq_u8_f16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) { return vreinterpretq_u8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_f32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) { return vreinterpretq_u8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) { return vreinterpretq_u8_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) { return vreinterpretq_u8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) { return vreinterpretq_u16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) { return vreinterpretq_u16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) { return vreinterpretq_u16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) { return vreinterpretq_u16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t 
test_vreinterpretq_u16_u8(uint8x16_t a) { return vreinterpretq_u16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) { return vreinterpretq_u16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) { return vreinterpretq_u16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_f16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) { return vreinterpretq_u16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_f32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) { return vreinterpretq_u16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) { return vreinterpretq_u16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) { return vreinterpretq_u16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) { return vreinterpretq_u32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) { return vreinterpretq_u32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s32 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 { +// CHECK: ret <4 x i32> %a uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) { return vreinterpretq_u32_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s64 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) { return vreinterpretq_u32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) { return vreinterpretq_u32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t 
test_vreinterpretq_u32_u16(uint16x8_t a) { return vreinterpretq_u32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u64 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) { return vreinterpretq_u32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_f16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) { return vreinterpretq_u32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_f32 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) { return vreinterpretq_u32_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) { return vreinterpretq_u32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) { return vreinterpretq_u32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) { return vreinterpretq_u64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) { return vreinterpretq_u64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) { return vreinterpretq_u64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s64 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) { return vreinterpretq_u64_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) { return vreinterpretq_u64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) { return vreinterpretq_u64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 
x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) { return vreinterpretq_u64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_f16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) { return vreinterpretq_u64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_f32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) { return vreinterpretq_u64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) { return vreinterpretq_u64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) { return vreinterpretq_u64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s8 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) { return vreinterpretq_f16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s16 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) { return vreinterpretq_f16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s32 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) { return vreinterpretq_f16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s64 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) { return vreinterpretq_f16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u8 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) { return vreinterpretq_f16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u16 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) { return vreinterpretq_f16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u32 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) { return vreinterpretq_f16_u32(a); } 
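+// The vreinterpretq_* conversions in this block are value-preserving type
+// punning: when source and destination lanes already share an IR type the
+// argument is returned unchanged, otherwise exactly one 'bitcast' is emitted.
+// A minimal sketch (illustrative only, not one of the generated tests):
+//   uint32x4_t u = vdupq_n_u32(1);
+//   int32x4_t s = vreinterpretq_s32_u32(u);   // same IR type: ret <4 x i32> %a, no cast
+//   float32x4_t f = vreinterpretq_f32_u32(u); // one bitcast <4 x i32> to <4 x float>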
-// CHECK-LABEL: test_vreinterpretq_f16_u64 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) { return vreinterpretq_f16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_f32 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) { return vreinterpretq_f16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_f16_p8 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) { return vreinterpretq_f16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_p16 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) { return vreinterpretq_f16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s8 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) { return vreinterpretq_f32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s16 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) { return vreinterpretq_f32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s32 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) { return vreinterpretq_f32_s32(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s64 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) { return vreinterpretq_f32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u8 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) { return vreinterpretq_f32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u16 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) { return vreinterpretq_f32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u32 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) { return vreinterpretq_f32_u32(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u64 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x 
i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) { return vreinterpretq_f32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_f16 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) { return vreinterpretq_f32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p8 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) { return vreinterpretq_f32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p16 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) { return vreinterpretq_f32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) { return vreinterpretq_p8_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) { return vreinterpretq_p8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) { return vreinterpretq_p8_s32(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) { return vreinterpretq_p8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) { return vreinterpretq_p8_u8(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) { return vreinterpretq_p8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) { return vreinterpretq_p8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) { return vreinterpretq_p8_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_f16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) 
#0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) { return vreinterpretq_p8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_f32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) { return vreinterpretq_p8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p8_p16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) { return vreinterpretq_p8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) { return vreinterpretq_p16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) { return vreinterpretq_p16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) { return vreinterpretq_p16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) { return vreinterpretq_p16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) { return vreinterpretq_p16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) { return vreinterpretq_p16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) { return vreinterpretq_p16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) { return vreinterpretq_p16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_f16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) { return vreinterpretq_p16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_f32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) 
#0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) { return vreinterpretq_p16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_p8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) { return vreinterpretq_p16_p8(a); } -// CHECK-LABEL: test_vrev16_s8 -// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev16_s8(int8x8_t a) { return vrev16_s8(a); } -// CHECK-LABEL: test_vrev16_u8 -// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev16_u8(uint8x8_t a) { return vrev16_u8(a); } -// CHECK-LABEL: test_vrev16_p8 -// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vrev16_p8(poly8x8_t a) { return vrev16_p8(a); } -// CHECK-LABEL: test_vrev16q_s8 -// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev16q_s8(int8x16_t a) { return vrev16q_s8(a); } -// CHECK-LABEL: test_vrev16q_u8 -// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev16q_u8(uint8x16_t a) { return vrev16q_u8(a); } -// CHECK-LABEL: test_vrev16q_p8 -// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev16q_p8(poly8x16_t a) { return vrev16q_p8(a); } -// CHECK-LABEL: test_vrev32_s8 -// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev32_s8(int8x8_t a) { return vrev32_s8(a); } -// CHECK-LABEL: test_vrev32_s16 -// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vrev32_s16(int16x4_t a) { return vrev32_s16(a); } -// CHECK-LABEL: test_vrev32_u8 -// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev32_u8(uint8x8_t a) { return vrev32_u8(a); } -// CHECK-LABEL: test_vrev32_u16 -// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}} +//
CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vrev32_u16(uint16x4_t a) { return vrev32_u16(a); } -// CHECK-LABEL: test_vrev32_p8 -// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vrev32_p8(poly8x8_t a) { return vrev32_p8(a); } -// CHECK-LABEL: test_vrev32_p16 -// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vrev32_p16(poly16x4_t a) { return vrev32_p16(a); } -// CHECK-LABEL: test_vrev32q_s8 -// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev32q_s8(int8x16_t a) { return vrev32q_s8(a); } -// CHECK-LABEL: test_vrev32q_s16 -// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vrev32q_s16(int16x8_t a) { return vrev32q_s16(a); } -// CHECK-LABEL: test_vrev32q_u8 -// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev32q_u8(uint8x16_t a) { return vrev32q_u8(a); } -// CHECK-LABEL: test_vrev32q_u16 -// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vrev32q_u16(uint16x8_t a) { return vrev32q_u16(a); } -// CHECK-LABEL: test_vrev32q_p8 -// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev32q_p8(poly8x16_t a) { return vrev32q_p8(a); } -// CHECK-LABEL: test_vrev32q_p16 -// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vrev32q_p16(poly16x8_t a) { return vrev32q_p16(a); } -// CHECK-LABEL: test_vrev64_s8 -// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev64_s8(int8x8_t a) { return vrev64_s8(a); } -// CHECK-LABEL: test_vrev64_s16 -// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev64_s16(int16x4_t a) { return vrev64_s16(a); } -// CHECK-LABEL: test_vrev64_s32 -// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vrev64_s32(int32x2_t a) { return vrev64_s32(a); } -// CHECK-LABEL: test_vrev64_u8 -// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev64_u8(uint8x8_t a) { return vrev64_u8(a); } -// CHECK-LABEL: test_vrev64_u16 -// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vrev64_u16(uint16x4_t a) { return vrev64_u16(a); } -// CHECK-LABEL: test_vrev64_u32 -// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vrev64_u32(uint32x2_t a) { return vrev64_u32(a); } -// CHECK-LABEL: test_vrev64_p8 -// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vrev64_p8(poly8x8_t a) { return vrev64_p8(a); } -// CHECK-LABEL: test_vrev64_p16 -// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vrev64_p16(poly16x4_t a) { return vrev64_p16(a); } -// CHECK-LABEL: test_vrev64_f32 -// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vrev64_f32(float32x2_t a) { return vrev64_f32(a); } -// CHECK-LABEL: test_vrev64q_s8 -// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev64q_s8(int8x16_t a) { return vrev64q_s8(a); } -// CHECK-LABEL: test_vrev64q_s16 -// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vrev64q_s16(int16x8_t a) { return vrev64q_s16(a); } -// CHECK-LABEL: test_vrev64q_s32 -// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vrev64q_s32(int32x4_t a) { return vrev64q_s32(a); } -// CHECK-LABEL: test_vrev64q_u8 -// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8>
@test_vrev64q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev64q_u8(uint8x16_t a) { return vrev64q_u8(a); } -// CHECK-LABEL: test_vrev64q_u16 -// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vrev64q_u16(uint16x8_t a) { return vrev64q_u16(a); } -// CHECK-LABEL: test_vrev64q_u32 -// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vrev64q_u32(uint32x4_t a) { return vrev64q_u32(a); } -// CHECK-LABEL: test_vrev64q_p8 -// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev64q_p8(poly8x16_t a) { return vrev64q_p8(a); } -// CHECK-LABEL: test_vrev64q_p16 -// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vrev64q_p16(poly16x8_t a) { return vrev64q_p16(a); } -// CHECK-LABEL: test_vrev64q_f32 -// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vrev64q_f32(float32x4_t a) { return vrev64q_f32(a); } -// CHECK-LABEL: test_vrhadd_s8 -// CHECK: vrhadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VRHADD_V_I]] int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) { return vrhadd_s8(a, b); } -// CHECK-LABEL: test_vrhadd_s16 -// CHECK: vrhadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4 +// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) { return vrhadd_s16(a, b); } -// CHECK-LABEL: test_vrhadd_s32 -// CHECK: vrhadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK:
[[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4 +// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) { return vrhadd_s32(a, b); } -// CHECK-LABEL: test_vrhadd_u8 -// CHECK: vrhadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VRHADD_V_I]] uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) { return vrhadd_u8(a, b); } -// CHECK-LABEL: test_vrhadd_u16 -// CHECK: vrhadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4 +// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) { return vrhadd_u16(a, b); } -// CHECK-LABEL: test_vrhadd_u32 -// CHECK: vrhadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4 +// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) { return vrhadd_u32(a, b); } -// CHECK-LABEL: test_vrhaddq_s8 -// CHECK: vrhadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRHADDQ_V_I]] int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) { return vrhaddq_s8(a, b); } -// CHECK-LABEL: test_vrhaddq_s16 -// CHECK: vrhadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4 +// CHECK: 
[[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) { return vrhaddq_s16(a, b); } -// CHECK-LABEL: test_vrhaddq_s32 -// CHECK: vrhadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4 +// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) { return vrhaddq_s32(a, b); } -// CHECK-LABEL: test_vrhaddq_u8 -// CHECK: vrhadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRHADDQ_V_I]] uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) { return vrhaddq_u8(a, b); } -// CHECK-LABEL: test_vrhaddq_u16 -// CHECK: vrhadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4 +// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) { return vrhaddq_u16(a, b); } -// CHECK-LABEL: test_vrhaddq_u32 -// CHECK: vrhadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4 +// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) { return vrhaddq_u32(a, b); } -// CHECK-LABEL: test_vrshl_s8 -// CHECK: vrshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> 
[[VRSHL_V_I]] int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) { return vrshl_s8(a, b); } -// CHECK-LABEL: test_vrshl_s16 -// CHECK: vrshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) { return vrshl_s16(a, b); } -// CHECK-LABEL: test_vrshl_s32 -// CHECK: vrshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) { return vrshl_s32(a, b); } -// CHECK-LABEL: test_vrshl_s64 -// CHECK: vrshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) { return vrshl_s64(a, b); } -// CHECK-LABEL: test_vrshl_u8 -// CHECK: vrshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VRSHL_V_I]] uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) { return vrshl_u8(a, b); } -// CHECK-LABEL: test_vrshl_u16 -// CHECK: vrshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = 
bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) { return vrshl_u16(a, b); } -// CHECK-LABEL: test_vrshl_u32 -// CHECK: vrshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) { return vrshl_u32(a, b); } -// CHECK-LABEL: test_vrshl_u64 -// CHECK: vrshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) { return vrshl_u64(a, b); } -// CHECK-LABEL: test_vrshlq_s8 -// CHECK: vrshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRSHLQ_V_I]] int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) { return vrshlq_s8(a, b); } -// CHECK-LABEL: test_vrshlq_s16 -// CHECK: vrshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) { return vrshlq_s16(a, b); } -// CHECK-LABEL: test_vrshlq_s32 -// CHECK: vrshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> 
[[TMP1]] to <4 x i32> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) { return vrshlq_s32(a, b); } -// CHECK-LABEL: test_vrshlq_s64 -// CHECK: vrshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) { return vrshlq_s64(a, b); } -// CHECK-LABEL: test_vrshlq_u8 -// CHECK: vrshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRSHLQ_V_I]] uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) { return vrshlq_u8(a, b); } -// CHECK-LABEL: test_vrshlq_u16 -// CHECK: vrshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) { return vrshlq_u16(a, b); } -// CHECK-LABEL: test_vrshlq_u32 -// CHECK: vrshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) { return vrshlq_u32(a, b); } -// CHECK-LABEL: test_vrshlq_u64 -// CHECK: vrshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) { return vrshlq_u64(a, b); } -// CHECK-LABEL: test_vrshrn_n_s16 -// CHECK: vrshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VRSHRN_N]]1 int8x8_t test_vrshrn_n_s16(int16x8_t a) { return vrshrn_n_s16(a, 1); } -// CHECK-LABEL: test_vrshrn_n_s32 -// CHECK: vrshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VRSHRN_N]]1 int16x4_t test_vrshrn_n_s32(int32x4_t a) { return vrshrn_n_s32(a, 1); } -// CHECK-LABEL: test_vrshrn_n_s64 -// CHECK: vrshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VRSHRN_N]]1 int32x2_t test_vrshrn_n_s64(int64x2_t a) { return vrshrn_n_s64(a, 1); } -// CHECK-LABEL: test_vrshrn_n_u16 -// CHECK: vrshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VRSHRN_N]]1 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) { return vrshrn_n_u16(a, 1); } -// CHECK-LABEL: test_vrshrn_n_u32 -// CHECK: vrshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VRSHRN_N]]1 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) { return vrshrn_n_u32(a, 1); } -// CHECK-LABEL: test_vrshrn_n_u64 -// CHECK: vrshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: 
[[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VRSHRN_N]]1 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) { return vrshrn_n_u64(a, 1); } -// CHECK-LABEL: test_vrshr_n_s8 -// CHECK: vrshr.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: ret <8 x i8> [[VRSHR_N]] int8x8_t test_vrshr_n_s8(int8x8_t a) { return vrshr_n_s8(a, 1); } -// CHECK-LABEL: test_vrshr_n_s16 -// CHECK: vrshr.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: ret <4 x i16> [[VRSHR_N]]1 int16x4_t test_vrshr_n_s16(int16x4_t a) { return vrshr_n_s16(a, 1); } -// CHECK-LABEL: test_vrshr_n_s32 -// CHECK: vrshr.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>) +// CHECK: ret <2 x i32> [[VRSHR_N]]1 int32x2_t test_vrshr_n_s32(int32x2_t a) { return vrshr_n_s32(a, 1); } -// CHECK-LABEL: test_vrshr_n_s64 -// CHECK: vrshr.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>) +// CHECK: ret <1 x i64> [[VRSHR_N]]1 int64x1_t test_vrshr_n_s64(int64x1_t a) { return vrshr_n_s64(a, 1); } -// CHECK-LABEL: test_vrshr_n_u8 -// CHECK: vrshr.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: ret <8 x i8> [[VRSHR_N]] uint8x8_t test_vrshr_n_u8(uint8x8_t a) { return vrshr_n_u8(a, 1); } -// CHECK-LABEL: test_vrshr_n_u16 -// CHECK: vrshr.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: ret <4 x i16> [[VRSHR_N]]1 uint16x4_t test_vrshr_n_u16(uint16x4_t a) { return vrshr_n_u16(a, 1); } -// CHECK-LABEL: test_vrshr_n_u32 -// CHECK: vrshr.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>) +// CHECK: ret <2 x i32> [[VRSHR_N]]1 uint32x2_t test_vrshr_n_u32(uint32x2_t a) { return vrshr_n_u32(a, 1); } -// CHECK-LABEL: test_vrshr_n_u64 -// CHECK: vrshr.u64 d{{[0-9]+}},
d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>) +// CHECK: ret <1 x i64> [[VRSHR_N]]1 uint64x1_t test_vrshr_n_u64(uint64x1_t a) { return vrshr_n_u64(a, 1); } -// CHECK-LABEL: test_vrshrq_n_s8 -// CHECK: vrshr.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: ret <16 x i8> [[VRSHR_N]] int8x16_t test_vrshrq_n_s8(int8x16_t a) { return vrshrq_n_s8(a, 1); } -// CHECK-LABEL: test_vrshrq_n_s16 -// CHECK: vrshr.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: ret <8 x i16> [[VRSHR_N]]1 int16x8_t test_vrshrq_n_s16(int16x8_t a) { return vrshrq_n_s16(a, 1); } -// CHECK-LABEL: test_vrshrq_n_s32 -// CHECK: vrshr.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>) +// CHECK: ret <4 x i32> [[VRSHR_N]]1 int32x4_t test_vrshrq_n_s32(int32x4_t a) { return vrshrq_n_s32(a, 1); } -// CHECK-LABEL: test_vrshrq_n_s64 -// CHECK: vrshr.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>) +// CHECK: ret <2 x i64> [[VRSHR_N]]1 int64x2_t test_vrshrq_n_s64(int64x2_t a) { return vrshrq_n_s64(a, 1); } -// CHECK-LABEL: test_vrshrq_n_u8 -// CHECK: vrshr.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: ret <16 x i8> [[VRSHR_N]] uint8x16_t test_vrshrq_n_u8(uint8x16_t a) { return vrshrq_n_u8(a, 1); } -// CHECK-LABEL: test_vrshrq_n_u16 -// CHECK: vrshr.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: ret <8 x i16> [[VRSHR_N]]1 uint16x8_t test_vrshrq_n_u16(uint16x8_t a) { return vrshrq_n_u16(a, 1); } -// CHECK-LABEL: test_vrshrq_n_u32 -// CHECK: vrshr.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +//
CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>) +// CHECK: ret <4 x i32> [[VRSHR_N]]1 uint32x4_t test_vrshrq_n_u32(uint32x4_t a) { return vrshrq_n_u32(a, 1); } -// CHECK-LABEL: test_vrshrq_n_u64 -// CHECK: vrshr.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>) +// CHECK: ret <2 x i64> [[VRSHR_N]]1 uint64x2_t test_vrshrq_n_u64(uint64x2_t a) { return vrshrq_n_u64(a, 1); } -// CHECK-LABEL: test_vrsqrte_f32 -// CHECK: vrsqrte.f32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #4 +// CHECK: ret <2 x float> [[VRSQRTE_V1_I]] float32x2_t test_vrsqrte_f32(float32x2_t a) { return vrsqrte_f32(a); } -// CHECK-LABEL: test_vrsqrte_u32 -// CHECK: vrsqrte.u32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4 +// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]] uint32x2_t test_vrsqrte_u32(uint32x2_t a) { return vrsqrte_u32(a); } -// CHECK-LABEL: test_vrsqrteq_f32 -// CHECK: vrsqrte.f32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #4 +// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]] float32x4_t test_vrsqrteq_f32(float32x4_t a) { return vrsqrteq_f32(a); } -// CHECK-LABEL: test_vrsqrteq_u32 -// CHECK: vrsqrte.u32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4 +// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]] uint32x4_t test_vrsqrteq_u32(uint32x4_t a) { return vrsqrteq_u32(a); } -// CHECK-LABEL: test_vrsqrts_f32 -// CHECK: vrsqrts.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4 +// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8>
[[VRSQRTS_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) { return vrsqrts_f32(a, b); } -// CHECK-LABEL: test_vrsqrtsq_f32 -// CHECK: vrsqrts.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4 +// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) { return vrsqrtsq_f32(a, b); } -// CHECK-LABEL: test_vrsra_n_s8 -// CHECK: vrsra.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) { return vrsra_n_s8(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_s16 -// CHECK: vrsra.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i16> [[TMP3]] int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) { return vrsra_n_s16(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_s32 -// CHECK: vrsra.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i32> [[TMP3]] int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) { return vrsra_n_s32(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_s64 -// CHECK: vrsra.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N]]1 +//
CHECK: ret <1 x i64> [[TMP3]] int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) { return vrsra_n_s64(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_u8 -// CHECK: vrsra.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]] +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) { return vrsra_n_u8(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_u16 -// CHECK: vrsra.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i16> [[TMP3]] uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) { return vrsra_n_u16(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_u32 -// CHECK: vrsra.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i32> [[TMP3]] uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) { return vrsra_n_u32(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_u64 -// CHECK: vrsra.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <1 x i64> [[TMP3]] uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) { return vrsra_n_u64(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_s8 -// CHECK: vrsra.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) { return vrsraq_n_s8(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_s16 -// CHECK: vrsra.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x
i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <8 x i16> [[TMP3]] int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) { return vrsraq_n_s16(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_s32 -// CHECK: vrsra.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i32> [[TMP3]] int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) { return vrsraq_n_s32(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_s64 -// CHECK: vrsra.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i64> [[TMP3]] int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) { return vrsraq_n_s64(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_u8 -// CHECK: vrsra.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]] +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vrsraq_n_u8(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_u16 -// CHECK: vrsra.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <8 x i16> [[TMP3]] uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vrsraq_n_u16(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_u32 -// CHECK: vrsra.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]]
to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i32> [[TMP3]] uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vrsraq_n_u32(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_u64 -// CHECK: vrsra.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i64> [[TMP3]] uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vrsraq_n_u64(a, b, 1); } -// CHECK-LABEL: test_vrsubhn_s16 -// CHECK: vrsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) { return vrsubhn_s16(a, b); } -// CHECK-LABEL: test_vrsubhn_s32 -// CHECK: vrsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) { return vrsubhn_s32(a, b); } -// CHECK-LABEL: test_vrsubhn_s64 -// CHECK: vrsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) { return vrsubhn_s64(a, b); } -// CHECK-LABEL: test_vrsubhn_u16 -// CHECK: vrsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK:
[[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) { return vrsubhn_u16(a, b); } -// CHECK-LABEL: test_vrsubhn_u32 -// CHECK: vrsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) { return vrsubhn_u32(a, b); } -// CHECK-LABEL: test_vrsubhn_u64 -// CHECK: vrsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) { return vrsubhn_u64(a, b); } -// CHECK-LABEL: test_vset_lane_u8 -// CHECK: vmov +// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) { return vset_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vset_lane_u16 -// CHECK: vmov +// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) { return vset_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vset_lane_u32 -// CHECK: mov +// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VSET_LANE]] uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) { return vset_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vset_lane_s8 -// CHECK: vmov +// 
CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) { return vset_lane_s8(a, b, 7); } -// CHECK-LABEL: test_vset_lane_s16 -// CHECK: vmov +// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) { return vset_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vset_lane_s32 -// CHECK: mov +// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VSET_LANE]] int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) { return vset_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vset_lane_p8 -// CHECK: vmov +// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) { return vset_lane_p8(a, b, 7); } -// CHECK-LABEL: test_vset_lane_p16 -// CHECK: vmov +// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) { return vset_lane_p16(a, b, 3); } -// CHECK-LABEL: test_vset_lane_f32 -// CHECK: mov +// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1 +// CHECK: ret <2 x float> [[VSET_LANE]] float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) { return vset_lane_f32(a, b, 1); } -// CHECK-LABEL: test_vset_lane_f16 -// CHECK: mov +// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[__REINT_246:%.*]] = alloca half, align 2 +// CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8 +// CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8 +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: store half [[TMP0]], half* [[__REINT_246]], align 2 +// CHECK: store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8 +// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK: [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 1 +// CHECK: store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8 +// CHECK: [[TMP7:%.*]] = 
bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>* +// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8 +// CHECK: ret <4 x half> [[TMP8]] float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) { return vset_lane_f16(*a, b, 1); } -// CHECK-LABEL: test_vsetq_lane_u8 -// CHECK: vmov +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) { return vsetq_lane_u8(a, b, 15); } -// CHECK-LABEL: test_vsetq_lane_u16 -// CHECK: vmov +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VSET_LANE]] uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) { return vsetq_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vsetq_lane_u32 -// CHECK: vmov +// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) { return vsetq_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vsetq_lane_s8 -// CHECK: vmov +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) { return vsetq_lane_s8(a, b, 15); } -// CHECK-LABEL: test_vsetq_lane_s16 -// CHECK: vmov +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VSET_LANE]] int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) { return vsetq_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vsetq_lane_s32 -// CHECK: vmov +// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) { return vsetq_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vsetq_lane_p8 -// CHECK: vmov +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) { return vsetq_lane_p8(a, b, 15); } -// CHECK-LABEL: test_vsetq_lane_p16 -// CHECK: vmov +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VSET_LANE]] poly16x8_t 
test_vsetq_lane_p16(poly16_t a, poly16x8_t b) { return vsetq_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vsetq_lane_f32 -// CHECK: vmov +// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3 +// CHECK: ret <4 x float> [[VSET_LANE]] float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) { return vsetq_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vsetq_lane_f16 -// CHECK: vmov +// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[__REINT_248:%.*]] = alloca half, align 2 +// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16 +// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16 +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: store half [[TMP0]], half* [[__REINT_248]], align 2 +// CHECK: store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16 +// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK: [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3 +// CHECK: store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>* +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16 +// CHECK: ret <8 x half> [[TMP8]] float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) { return vsetq_lane_f16(*a, b, 3); } -// CHECK-LABEL: test_vset_lane_s64 // The optimizer is able to get rid of all moves now. +// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) { return vset_lane_s64(a, b, 0); } -// CHECK-LABEL: test_vset_lane_u64 // The optimizer is able to get rid of all moves now. 
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) { return vset_lane_u64(a, b, 0); } -// CHECK-LABEL: test_vsetq_lane_s64 -// CHECK: vmov +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) { return vsetq_lane_s64(a, b, 1); } -// CHECK-LABEL: test_vsetq_lane_u64 -// CHECK: vmov +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) { return vsetq_lane_u64(a, b, 1); } -// CHECK-LABEL: test_vshl_s8 -// CHECK: vshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VSHL_V_I]] int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) { return vshl_s8(a, b); } -// CHECK-LABEL: test_vshl_s16 -// CHECK: vshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) { return vshl_s16(a, b); } -// CHECK-LABEL: test_vshl_s32 -// CHECK: vshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) { return vshl_s32(a, b); } -// CHECK-LABEL: test_vshl_s64 -// CHECK: vshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = 
bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) { return vshl_s64(a, b); } -// CHECK-LABEL: test_vshl_u8 -// CHECK: vshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VSHL_V_I]] uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) { return vshl_u8(a, b); } -// CHECK-LABEL: test_vshl_u16 -// CHECK: vshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) { return vshl_u16(a, b); } -// CHECK-LABEL: test_vshl_u32 -// CHECK: vshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) { return vshl_u32(a, b); } -// CHECK-LABEL: test_vshl_u64 -// CHECK: vshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) { return vshl_u64(a, b); } -// CHECK-LABEL: test_vshlq_s8 -// CHECK: vshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x 
i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VSHLQ_V_I]] int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) { return vshlq_s8(a, b); } -// CHECK-LABEL: test_vshlq_s16 -// CHECK: vshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) { return vshlq_s16(a, b); } -// CHECK-LABEL: test_vshlq_s32 -// CHECK: vshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) { return vshlq_s32(a, b); } -// CHECK-LABEL: test_vshlq_s64 -// CHECK: vshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) { return vshlq_s64(a, b); } -// CHECK-LABEL: test_vshlq_u8 -// CHECK: vshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VSHLQ_V_I]] uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) { return vshlq_u8(a, b); } -// CHECK-LABEL: test_vshlq_u16 -// CHECK: vshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHLQ_V1_I:%.*]] 
= bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) { return vshlq_u16(a, b); } -// CHECK-LABEL: test_vshlq_u32 -// CHECK: vshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) { return vshlq_u32(a, b); } -// CHECK-LABEL: test_vshlq_u64 -// CHECK: vshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) { return vshlq_u64(a, b); } -// CHECK-LABEL: test_vshll_n_s8 -// CHECK: vshll.s8 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHLL_N]] int16x8_t test_vshll_n_s8(int8x8_t a) { return vshll_n_s8(a, 1); } -// CHECK-LABEL: test_vshll_n_s16 -// CHECK: vshll.s16 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHLL_N]] int32x4_t test_vshll_n_s16(int16x4_t a) { return vshll_n_s16(a, 1); } -// CHECK-LABEL: test_vshll_n_s32 -// CHECK: vshll.s32 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHLL_N]] int64x2_t test_vshll_n_s32(int32x2_t a) { return vshll_n_s32(a, 1); } -// CHECK-LABEL: test_vshll_n_u8 -// CHECK: vshll.u8
q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHLL_N]] uint16x8_t test_vshll_n_u8(uint8x8_t a) { return vshll_n_u8(a, 1); } -// CHECK-LABEL: test_vshll_n_u16 -// CHECK: vshll.u16 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHLL_N]] uint32x4_t test_vshll_n_u16(uint16x4_t a) { return vshll_n_u16(a, 1); } -// CHECK-LABEL: test_vshll_n_u32 -// CHECK: vshll.u32 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHLL_N]] uint64x2_t test_vshll_n_u32(uint32x2_t a) { return vshll_n_u32(a, 1); } -// CHECK-LABEL: test_vshl_n_s8 -// CHECK: vshl.i8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <8 x i8> [[VSHL_N]] int8x8_t test_vshl_n_s8(int8x8_t a) { return vshl_n_s8(a, 1); } -// CHECK-LABEL: test_vshl_n_s16 -// CHECK: vshl.i16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <4 x i16> [[VSHL_N]] int16x4_t test_vshl_n_s16(int16x4_t a) { return vshl_n_s16(a, 1); } -// CHECK-LABEL: test_vshl_n_s32 -// CHECK: vshl.i32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1> +// CHECK: ret <2 x i32> [[VSHL_N]] int32x2_t test_vshl_n_s32(int32x2_t a) { return vshl_n_s32(a, 1); } -// CHECK-LABEL: test_vshl_n_s64 -// CHECK: vshl.i64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1> +// CHECK: ret <1 x i64> [[VSHL_N]] int64x1_t test_vshl_n_s64(int64x1_t a) { return vshl_n_s64(a, 1); } -// CHECK-LABEL: test_vshl_n_u8 -// CHECK: vshl.i8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <8 x i8> [[VSHL_N]] uint8x8_t test_vshl_n_u8(uint8x8_t a) { return vshl_n_u8(a, 1); } -// CHECK-LABEL: test_vshl_n_u16 -// CHECK: vshl.i16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x
i16> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <4 x i16> [[VSHL_N]] uint16x4_t test_vshl_n_u16(uint16x4_t a) { return vshl_n_u16(a, 1); } -// CHECK-LABEL: test_vshl_n_u32 -// CHECK: vshl.i32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1> +// CHECK: ret <2 x i32> [[VSHL_N]] uint32x2_t test_vshl_n_u32(uint32x2_t a) { return vshl_n_u32(a, 1); } -// CHECK-LABEL: test_vshl_n_u64 -// CHECK: vshl.i64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1> +// CHECK: ret <1 x i64> [[VSHL_N]] uint64x1_t test_vshl_n_u64(uint64x1_t a) { return vshl_n_u64(a, 1); } -// CHECK-LABEL: test_vshlq_n_s8 -// CHECK: vshl.i8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <16 x i8> [[VSHL_N]] int8x16_t test_vshlq_n_s8(int8x16_t a) { return vshlq_n_s8(a, 1); } -// CHECK-LABEL: test_vshlq_n_s16 -// CHECK: vshl.i16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHL_N]] int16x8_t test_vshlq_n_s16(int16x8_t a) { return vshlq_n_s16(a, 1); } -// CHECK-LABEL: test_vshlq_n_s32 -// CHECK: vshl.i32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHL_N]] int32x4_t test_vshlq_n_s32(int32x4_t a) { return vshlq_n_s32(a, 1); } -// CHECK-LABEL: test_vshlq_n_s64 -// CHECK: vshl.i64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHL_N]] int64x2_t test_vshlq_n_s64(int64x2_t a) { return vshlq_n_s64(a, 1); } -// CHECK-LABEL: test_vshlq_n_u8 -// CHECK: vshl.i8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <16 x i8> [[VSHL_N]] uint8x16_t test_vshlq_n_u8(uint8x16_t a) { return vshlq_n_u8(a, 1); } -// CHECK-LABEL: test_vshlq_n_u16 -// CHECK: vshl.i16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHL_N]] uint16x8_t test_vshlq_n_u16(uint16x8_t a) { return vshlq_n_u16(a, 1); } -// CHECK-LABEL: test_vshlq_n_u32 -// CHECK: vshl.i32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHL_N]] uint32x4_t test_vshlq_n_u32(uint32x4_t a) { return vshlq_n_u32(a, 1); } -// CHECK-LABEL: test_vshlq_n_u64 -// CHECK: vshl.i64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHL_N]] uint64x2_t test_vshlq_n_u64(uint64x2_t a) { return vshlq_n_u64(a, 1); } -// CHECK-LABEL: test_vshrn_n_s16 -// CHECK: vshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSHRN_N]] int8x8_t test_vshrn_n_s16(int16x8_t a) { return vshrn_n_s16(a, 1); } -// CHECK-LABEL: test_vshrn_n_s32 -// CHECK: vshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSHRN_N]] int16x4_t test_vshrn_n_s32(int32x4_t a) { return vshrn_n_s32(a, 1); } -// CHECK-LABEL: test_vshrn_n_s64 -// CHECK: vshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSHRN_N]] int32x2_t test_vshrn_n_s64(int64x2_t a) { return vshrn_n_s64(a, 1); } -// CHECK-LABEL: test_vshrn_n_u16 -// CHECK: vshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSHRN_N]] uint8x8_t test_vshrn_n_u16(uint16x8_t a) { return vshrn_n_u16(a, 1); } -// CHECK-LABEL: test_vshrn_n_u32 -// CHECK: vshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSHRN_N]] uint16x4_t test_vshrn_n_u32(uint32x4_t a) { return vshrn_n_u32(a, 1); } -// CHECK-LABEL: test_vshrn_n_u64 -// CHECK: vshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 { +// CHECK:
[[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSHRN_N]] uint32x2_t test_vshrn_n_u64(uint64x2_t a) { return vshrn_n_u64(a, 1); } -// CHECK-LABEL: test_vshr_n_s8 -// CHECK: vshr.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <8 x i8> [[VSHR_N]] int8x8_t test_vshr_n_s8(int8x8_t a) { return vshr_n_s8(a, 1); } -// CHECK-LABEL: test_vshr_n_s16 -// CHECK: vshr.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <4 x i16> [[VSHR_N]] int16x4_t test_vshr_n_s16(int16x4_t a) { return vshr_n_s16(a, 1); } -// CHECK-LABEL: test_vshr_n_s32 -// CHECK: vshr.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 1, i32 1> +// CHECK: ret <2 x i32> [[VSHR_N]] int32x2_t test_vshr_n_s32(int32x2_t a) { return vshr_n_s32(a, 1); } -// CHECK-LABEL: test_vshr_n_s64 -// CHECK: vshr.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1> +// CHECK: ret <1 x i64> [[VSHR_N]] int64x1_t test_vshr_n_s64(int64x1_t a) { return vshr_n_s64(a, 1); } -// CHECK-LABEL: test_vshr_n_u8 -// CHECK: vshr.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <8 x i8> [[VSHR_N]] uint8x8_t test_vshr_n_u8(uint8x8_t a) { return vshr_n_u8(a, 1); } -// CHECK-LABEL: test_vshr_n_u16 -// CHECK: vshr.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <4 x i16> [[VSHR_N]] uint16x4_t test_vshr_n_u16(uint16x4_t a) { return vshr_n_u16(a, 1); } -// CHECK-LABEL: test_vshr_n_u32 -// CHECK: vshr.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 1, i32 1> +// CHECK: ret <2 x i32> [[VSHR_N]] uint32x2_t test_vshr_n_u32(uint32x2_t a) { return vshr_n_u32(a, 1); } -// CHECK-LABEL: test_vshr_n_u64 -// CHECK: vshr.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1> +// CHECK: ret <1 x i64> [[VSHR_N]] uint64x1_t
test_vshr_n_u64(uint64x1_t a) { return vshr_n_u64(a, 1); } -// CHECK-LABEL: test_vshrq_n_s8 -// CHECK: vshr.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <16 x i8> [[VSHR_N]] int8x16_t test_vshrq_n_s8(int8x16_t a) { return vshrq_n_s8(a, 1); } -// CHECK-LABEL: test_vshrq_n_s16 -// CHECK: vshr.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHR_N]] int16x8_t test_vshrq_n_s16(int16x8_t a) { return vshrq_n_s16(a, 1); } -// CHECK-LABEL: test_vshrq_n_s32 -// CHECK: vshr.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHR_N]] int32x4_t test_vshrq_n_s32(int32x4_t a) { return vshrq_n_s32(a, 1); } -// CHECK-LABEL: test_vshrq_n_s64 -// CHECK: vshr.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHR_N]] int64x2_t test_vshrq_n_s64(int64x2_t a) { return vshrq_n_s64(a, 1); } -// CHECK-LABEL: test_vshrq_n_u8 -// CHECK: vshr.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <16 x i8> [[VSHR_N]] uint8x16_t test_vshrq_n_u8(uint8x16_t a) { return vshrq_n_u8(a, 1); } -// CHECK-LABEL: test_vshrq_n_u16 -// CHECK: vshr.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHR_N]] uint16x8_t test_vshrq_n_u16(uint16x8_t a) { return vshrq_n_u16(a, 1); } -// CHECK-LABEL: test_vshrq_n_u32 -// CHECK: vshr.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHR_N]] uint32x4_t test_vshrq_n_u32(uint32x4_t a) { return vshrq_n_u32(a, 1); } -// CHECK-LABEL: test_vshrq_n_u64 -// CHECK: vshr.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHR_N]] uint64x2_t test_vshrq_n_u64(uint64x2_t a) { return vshrq_n_u64(a, 1); } -// CHECK-LABEL: test_vsli_n_s8 -// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +//
CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSLI_N]] int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) { return vsli_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsli_n_s16 -// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSLI_N2]] int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) { return vsli_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsli_n_s32 -// CHECK: vsli.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 1) +// CHECK: ret <2 x i32> [[VSLI_N2]] int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) { return vsli_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsli_n_s64 -// CHECK: vsli.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1) +// CHECK: ret <1 x i64> [[VSLI_N2]] int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) { return vsli_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsli_n_u8 -// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSLI_N]] uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) { return vsli_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsli_n_u16 -// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSLI_N2]] uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) { return vsli_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsli_n_u32 -// CHECK: vsli.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK:
[[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 1) +// CHECK: ret <2 x i32> [[VSLI_N2]] uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) { return vsli_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsli_n_u64 -// CHECK: vsli.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1) +// CHECK: ret <1 x i64> [[VSLI_N2]] uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) { return vsli_n_u64(a, b, 1); } -// CHECK-LABEL: test_vsli_n_p8 -// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSLI_N]] poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) { return vsli_n_p8(a, b, 1); } -// CHECK-LABEL: test_vsli_n_p16 -// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSLI_N2]] poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) { return vsli_n_p16(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_s8 -// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSLI_N]] int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) { return vsliq_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_s16 -// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSLI_N2]] int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) { return vsliq_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_s32 -// CHECK: vsli.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 1) +// CHECK: ret <4 x i32> [[VSLI_N2]] int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) { return vsliq_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_s64 -// CHECK: vsli.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 1) +// CHECK: ret <2 x i64> [[VSLI_N2]] int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) { return vsliq_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_u8 -// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSLI_N]] uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) { return vsliq_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_u16 -// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSLI_N2]] uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) { return vsliq_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_u32 -// CHECK: vsli.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 1) +// CHECK: ret <4 x i32> [[VSLI_N2]] uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) { return vsliq_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_u64 -// CHECK: vsli.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 1) +// CHECK: ret <2 x i64> [[VSLI_N2]] uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) { return vsliq_n_u64(a, b, 1); } -// CHECK-LABEL:
test_vsliq_n_p8 -// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSLI_N]] poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) { return vsliq_n_p8(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_p16 -// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSLI_N2]] poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) { return vsliq_n_p16(a, b, 1); } -// CHECK-LABEL: test_vsra_n_s8 -// CHECK: vsra.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) { return vsra_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsra_n_s16 -// CHECK: vsra.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i16> [[TMP4]] int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) { return vsra_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsra_n_s32 -// CHECK: vsra.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 1, i32 1> +// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i32> [[TMP4]] int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) { return vsra_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsra_n_s64 -// CHECK: vsra.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1> +// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <1 x i64> [[TMP4]] int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) { return vsra_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsra_n_u8 -// CHECK: vsra.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8>
@test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) { return vsra_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsra_n_u16 -// CHECK: vsra.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i16> [[TMP4]] uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) { return vsra_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsra_n_u32 -// CHECK: vsra.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 1, i32 1> +// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i32> [[TMP4]] uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) { return vsra_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsra_n_u64 -// CHECK: vsra.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1> +// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <1 x i64> [[TMP4]] uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) { return vsra_n_u64(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_s8 -// CHECK: vsra.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) { return vsraq_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_s16 -// CHECK: vsra.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <8 x i16> [[TMP4]] int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) { return vsraq_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_s32 -// CHECK: vsra.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]]
= bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i32> [[TMP4]] int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) { return vsraq_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_s64 -// CHECK: vsra.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 1, i64 1> +// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i64> [[TMP4]] int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) { return vsraq_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_u8 -// CHECK: vsra.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vsraq_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_u16 -// CHECK: vsra.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <8 x i16> [[TMP4]] uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vsraq_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_u32 -// CHECK: vsra.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i32> [[TMP4]] uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vsraq_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_u64 -// CHECK: vsra.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 1, i64 1> +// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i64> [[TMP4]] uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vsraq_n_u64(a, b, 1); } -// CHECK-LABEL: test_vsri_n_s8 -// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +//
CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSRI_N]] int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) { return vsri_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsri_n_s16 -// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSRI_N2]] int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) { return vsri_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsri_n_s32 -// CHECK: vsri.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 1) +// CHECK: ret <2 x i32> [[VSRI_N2]] int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) { return vsri_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsri_n_s64 -// CHECK: vsri.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1) +// CHECK: ret <1 x i64> [[VSRI_N2]] int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) { return vsri_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsri_n_u8 -// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSRI_N]] uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) { return vsri_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsri_n_u16 -// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSRI_N2]] uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) { return vsri_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsri_n_u32 -// CHECK: vsri.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0
{ +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 1) +// CHECK: ret <2 x i32> [[VSRI_N2]] uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) { return vsri_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsri_n_u64 -// CHECK: vsri.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1) +// CHECK: ret <1 x i64> [[VSRI_N2]] uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) { return vsri_n_u64(a, b, 1); } -// CHECK-LABEL: test_vsri_n_p8 -// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSRI_N]] poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) { return vsri_n_p8(a, b, 1); } -// CHECK-LABEL: test_vsri_n_p16 -// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSRI_N2]] poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) { return vsri_n_p16(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_s8 -// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSRI_N]] int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) { return vsriq_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_s16 -// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSRI_N2]] int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) { return vsriq_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_s32 -// CHECK: vsri.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a
to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 1) +// CHECK: ret <4 x i32> [[VSRI_N2]] int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) { return vsriq_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_s64 -// CHECK: vsri.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 1) +// CHECK: ret <2 x i64> [[VSRI_N2]] int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) { return vsriq_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_u8 -// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSRI_N]] uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) { return vsriq_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_u16 -// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSRI_N2]] uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) { return vsriq_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_u32 -// CHECK: vsri.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 1) +// CHECK: ret <4 x i32> [[VSRI_N2]] uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) { return vsriq_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_u64 -// CHECK: vsri.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 1) +// CHECK: ret <2 x i64> [[VSRI_N2]] uint64x2_t test_vsriq_n_u64(uint64x2_t a,
uint64x2_t b) { return vsriq_n_u64(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_p8 -// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSRI_N]] poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) { return vsriq_n_p8(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_p16 -// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSRI_N2]] poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) { return vsriq_n_p16(a, b, 1); } -// CHECK-LABEL: test_vst1q_u8 -// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_u8(uint8_t * a, uint8x16_t b) { vst1q_u8(a, b); } -// CHECK-LABEL: test_vst1q_u16 -// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_u16(uint16_t * a, uint16x8_t b) { vst1q_u16(a, b); } -// CHECK-LABEL: test_vst1q_u32 -// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1q_u32(uint32_t * a, uint32x4_t b) { vst1q_u32(a, b); } -// CHECK-LABEL: test_vst1q_u64 -// CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1q_u64(uint64_t * a, uint64x2_t b) { vst1q_u64(a, b); } -// CHECK-LABEL: test_vst1q_s8 -// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_s8(int8_t * a, int8x16_t b) { vst1q_s8(a, b); } -// CHECK-LABEL: test_vst1q_s16 -// CHECK: vst1.16
{d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_s16(int16_t * a, int16x8_t b) { vst1q_s16(a, b); } -// CHECK-LABEL: test_vst1q_s32 -// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1q_s32(int32_t * a, int32x4_t b) { vst1q_s32(a, b); } -// CHECK-LABEL: test_vst1q_s64 -// CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1q_s64(int64_t * a, int64x2_t b) { vst1q_s64(a, b); } -// CHECK-LABEL: test_vst1q_f16 -// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_f16(float16_t * a, float16x8_t b) { vst1q_f16(a, b); } -// CHECK-LABEL: test_vst1q_f32 -// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: store <4 x float> [[TMP3]], <4 x float>* [[TMP2]] +// CHECK: ret void void test_vst1q_f32(float32_t * a, float32x4_t b) { vst1q_f32(a, b); } -// CHECK-LABEL: test_vst1q_p8 -// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_p8(poly8_t * a, poly8x16_t b) { vst1q_p8(a, b); } -// CHECK-LABEL: test_vst1q_p16 -// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void 
test_vst1q_p16(poly16_t * a, poly16x8_t b) { vst1q_p16(a, b); } -// CHECK-LABEL: test_vst1_u8 -// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_u8(uint8_t * a, uint8x8_t b) { vst1_u8(a, b); } -// CHECK-LABEL: test_vst1_u16 -// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_u16(uint16_t * a, uint16x4_t b) { vst1_u16(a, b); } -// CHECK-LABEL: test_vst1_u32 -// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1_u32(uint32_t * a, uint32x2_t b) { vst1_u32(a, b); } -// CHECK-LABEL: test_vst1_u64 -// CHECK: vst1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1_u64(uint64_t * a, uint64x1_t b) { vst1_u64(a, b); } -// CHECK-LABEL: test_vst1_s8 -// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_s8(int8_t * a, int8x8_t b) { vst1_s8(a, b); } -// CHECK-LABEL: test_vst1_s16 -// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_s16(int16_t * a, int16x4_t b) { vst1_s16(a, b); } -// CHECK-LABEL: test_vst1_s32 -// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1_s32(int32_t * a, int32x2_t b) { vst1_s32(a, b); } -// CHECK-LABEL: test_vst1_s64 -// CHECK: vst1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1_s64(int64_t * a, int64x1_t b) { vst1_s64(a, b); } -// CHECK-LABEL: test_vst1_f16 -// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_f16(float16_t * a, float16x4_t b) { vst1_f16(a, b); } -// CHECK-LABEL: test_vst1_f32 -// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: store <2 x float> [[TMP3]], <2 x float>* [[TMP2]] +// CHECK: ret void void test_vst1_f32(float32_t * a, float32x2_t b) { vst1_f32(a, b); } -// CHECK-LABEL: test_vst1_p8 -// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_p8(poly8_t * a, poly8x8_t b) { vst1_p8(a, b); } -// CHECK-LABEL: test_vst1_p16 -// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_p16(poly16_t * a, poly16x4_t b) { vst1_p16(a, b); } -// CHECK-LABEL: test_vst1q_lane_u8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) { vst1q_lane_u8(a, b, 15); } -// CHECK-LABEL: test_vst1q_lane_u16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) { vst1q_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vst1q_lane_u32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = 
bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: store i32 [[TMP3]], i32* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) { vst1q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vst1q_lane_u64 -// CHECK: {{str|vstr|vmov}} +// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: store i64 [[TMP3]], i64* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) { vst1q_lane_u64(a, b, 1); } -// CHECK-LABEL: test_vst1q_lane_s8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1q_lane_s8(int8_t * a, int8x16_t b) { vst1q_lane_s8(a, b, 15); } -// CHECK-LABEL: test_vst1q_lane_s16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_s16(int16_t * a, int16x8_t b) { vst1q_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vst1q_lane_s32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: store i32 [[TMP3]], i32* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_s32(int32_t * a, int32x4_t b) { vst1q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vst1q_lane_s64 -// CHECK: {{str|vstr|vmov}} +// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: store i64 [[TMP3]], i64* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_s64(int64_t * a, int64x2_t b) { vst1q_lane_s64(a, b, 1); } -// CHECK-LABEL: test_vst1q_lane_f16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x 
i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_f16(float16_t * a, float16x8_t b) { vst1q_lane_f16(a, b, 7); } -// CHECK-LABEL: test_vst1q_lane_f32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: store float [[TMP3]], float* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_f32(float32_t * a, float32x4_t b) { vst1q_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vst1q_lane_p8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) { vst1q_lane_p8(a, b, 15); } -// CHECK-LABEL: test_vst1q_lane_p16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) { vst1q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vst1_lane_u8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) { vst1_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vst1_lane_u16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) { vst1_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vst1_lane_u32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: store i32 [[TMP3]], i32* [[TMP4]] +// CHECK: ret void void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) { vst1_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vst1_lane_u64 -// CHECK: {{str|vstr|vmov}} +// CHECK-LABEL: define void 
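// Illustrative sketch (not part of the patch): the *_lane forms store a single
// element, which is why the checks above match an `extractelement` followed by
// a scalar store (the 8-bit cases skip the bitcast round-trip because the
// pointer is already i8*). Assumes an AArch64 host with NEON;
// `check_vst1_lane_u16` is a hypothetical helper, not one of the tests.
#include <arm_neon.h>
#include <assert.h>

static void check_vst1_lane_u16(void) {
  uint16_t src[4] = {10, 11, 12, 13};
  uint16_t dst = 0;
  uint16x4_t v = vld1_u16(src);
  vst1_lane_u16(&dst, v, 3); /* writes one i16: element 3 of v */
  assert(dst == 13);
}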
@test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: store i64 [[TMP3]], i64* [[TMP4]] +// CHECK: ret void void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) { vst1_lane_u64(a, b, 0); } -// CHECK-LABEL: test_vst1_lane_s8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1_lane_s8(int8_t * a, int8x8_t b) { vst1_lane_s8(a, b, 7); } -// CHECK-LABEL: test_vst1_lane_s16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1_lane_s16(int16_t * a, int16x4_t b) { vst1_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vst1_lane_s32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: store i32 [[TMP3]], i32* [[TMP4]] +// CHECK: ret void void test_vst1_lane_s32(int32_t * a, int32x2_t b) { vst1_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vst1_lane_s64 -// CHECK: {{str|vstr|vmov}} +// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: store i64 [[TMP3]], i64* [[TMP4]] +// CHECK: ret void void test_vst1_lane_s64(int64_t * a, int64x1_t b) { vst1_lane_s64(a, b, 0); } -// CHECK-LABEL: test_vst1_lane_f16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1_lane_f16(float16_t * a, float16x4_t b) { vst1_lane_f16(a, b, 3); } -// CHECK-LABEL: test_vst1_lane_f32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> 
%b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: store float [[TMP3]], float* [[TMP4]] +// CHECK: ret void void test_vst1_lane_f32(float32_t * a, float32x2_t b) { vst1_lane_f32(a, b, 1); } -// CHECK-LABEL: test_vst1_lane_p8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) { vst1_lane_p8(a, b, 7); } -// CHECK-LABEL: test_vst1_lane_p16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) { vst1_lane_p16(a, b, 3); } -// CHECK-LABEL: test_vst2q_u8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) { vst2q_u8(a, b); } -// CHECK-LABEL: test_vst2q_u16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = 
bitcast %struct.uint16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) { vst2q_u16(a, b); } -// CHECK-LABEL: test_vst2q_u32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) { vst2q_u32(a, b); } -// CHECK-LABEL: test_vst2q_s8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds 
%struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_s8(int8_t * a, int8x16x2_t b) { vst2q_s8(a, b); } -// CHECK-LABEL: test_vst2q_s16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_s16(int16_t * a, int16x8x2_t b) { vst2q_s16(a, b); } -// CHECK-LABEL: test_vst2q_s32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] 
[[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_s32(int32_t * a, int32x4x2_t b) { vst2q_s32(a, b); } -// CHECK-LABEL: test_vst2q_f16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_f16(float16_t * a, float16x8x2_t b) { vst2q_f16(a, b); } -// CHECK-LABEL: test_vst2q_f32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// 
CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_f32(float32_t * a, float32x4x2_t b) { vst2q_f32(a, b); } -// CHECK-LABEL: test_vst2q_p8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) { vst2q_p8(a, b); } -// CHECK-LABEL: test_vst2q_p16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca 
%struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) { vst2q_p16(a, b); } -// CHECK-LABEL: test_vst2_u8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2_u8(uint8_t * a, uint8x8x2_t b) { vst2_u8(a, b); } -// CHECK-LABEL: test_vst2_u16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds 
%struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u16(uint16_t * a, uint16x4x2_t b) { vst2_u16(a, b); } -// CHECK-LABEL: test_vst2_u32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u32(uint32_t * a, uint32x2x2_t b) { vst2_u32(a, b); } -// CHECK-LABEL: test_vst2_u64 -// CHECK: vst1.64 +// CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, 
align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u64(uint64_t * a, uint64x1x2_t b) { vst2_u64(a, b); } -// CHECK-LABEL: test_vst2_s8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2_s8(int8_t * a, int8x8x2_t b) { vst2_s8(a, b); } -// CHECK-LABEL: test_vst2_s16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0 +// 
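// Note on the recurring preamble above (an illustration, not part of the
// patch): AAPCS64 passes these homogeneous NEON structs as a [2 x <N x T>]
// array, so every vst2 test opens the same way: the coerced argument is
// stored into the local for `b`, copied into the implicit `__s1` temporary
// with @llvm.memcpy, and each element is reloaded through getelementptr
// before the @llvm.aarch64.neon.st2 call. `opt -mem2reg` leaves this in
// place because the allocas are used by memcpy and GEPs rather than by
// simple scalar loads and stores, so the checks must match it explicitly.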
CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s16(int16_t * a, int16x4x2_t b) { vst2_s16(a, b); } -// CHECK-LABEL: test_vst2_s32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s32(int32_t * a, int32x2x2_t b) { vst2_s32(a, b); } -// CHECK-LABEL: test_vst2_s64 -// CHECK: vst1.64 +// CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: 
[[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s64(int64_t * a, int64x1x2_t b) { vst2_s64(a, b); } -// CHECK-LABEL: test_vst2_f16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_f16(float16_t * a, float16x4x2_t b) { vst2_f16(a, b); } -// CHECK-LABEL: test_vst2_f32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void 
@test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st2.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_f32(float32_t * a, float32x2x2_t b) { vst2_f32(a, b); } -// CHECK-LABEL: test_vst2_p8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2_p8(poly8_t * a, poly8x8x2_t b) { vst2_p8(a, b); } -// CHECK-LABEL: test_vst2_p16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: 
[[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_p16(poly16_t * a, poly16x4x2_t b) { vst2_p16(a, b); } -// CHECK-LABEL: test_vst2q_lane_u16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) { vst2q_lane_u16(a, b, 7); } -// 
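// Illustrative sketch (not part of the patch) of the semantics behind the
// st2/st2lane checks: vst2q interleaves its two registers element-wise, and
// vst2q_lane stores only the chosen lane of each register, which is why the
// new checks end in @llvm.aarch64.neon.st2lane with the lane index as an i64
// operand. Assumes an AArch64 host with NEON; `check_vst2q_lane_u16` is a
// hypothetical helper, not one of the tests.
#include <arm_neon.h>
#include <assert.h>

static void check_vst2q_lane_u16(void) {
  uint16_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  uint16_t b[8] = {100, 101, 102, 103, 104, 105, 106, 107};
  uint16x8x2_t v;
  v.val[0] = vld1q_u16(a);
  v.val[1] = vld1q_u16(b);

  uint16_t interleaved[16] = {0};
  vst2q_u16(interleaved, v);  /* writes a0,b0,a1,b1,... */
  assert(interleaved[14] == 7 && interleaved[15] == 107);

  uint16_t pair[2] = {0, 0};
  vst2q_lane_u16(pair, v, 7); /* stores just lane 7 of each register */
  assert(pair[0] == 7 && pair[1] == 107);
}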
CHECK-LABEL: test_vst2q_lane_u32 -// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) { vst2q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vst2q_lane_s16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x 
i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) { vst2q_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vst2q_lane_s32 -// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) { vst2q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vst2q_lane_f16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 
-// CHECK-LABEL: test_vst2q_lane_f16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x half>] %b.coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
   vst2q_lane_f16(a, b, 7);
 }
-// CHECK-LABEL: test_vst2q_lane_f32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x float>] %b.coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
   vst2q_lane_f32(a, b, 3);
 }
-// CHECK-LABEL: test_vst2q_lane_p16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) { vst2q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vst2_lane_u8 -// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) { vst2_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vst2_lane_u16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x 
<4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) { vst2_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vst2_lane_u32 -// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) { vst2_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vst2_lane_s8 -// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* 
[[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) { vst2_lane_s8(a, b, 7); } -// CHECK-LABEL: test_vst2_lane_s16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) { vst2_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vst2_lane_s32 -// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = 
bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) { vst2_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vst2_lane_f16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) { vst2_lane_f16(a, b, 3); } -// CHECK-LABEL: test_vst2_lane_f32 -// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x float>] 
[[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) { vst2_lane_f32(a, b, 1); } -// CHECK-LABEL: test_vst2_lane_p8 -// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) { vst2_lane_p8(a, b, 7); } -// CHECK-LABEL: test_vst2_lane_p16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x 
i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
   vst2_lane_p16(a, b, 3);
 }
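+// The vst3q tests below follow the same shape as the vst2 ones above: the
+// coerced [3 x <N x T>] argument is spilled to a local copy, the three
+// vectors are reloaded one array element at a time (through <16 x i8>
+// bitcasts for the non-i8 element types), and a single
+// @llvm.aarch64.neon.st3.* call consumes them. A usage sketch (comment only,
+// assuming hypothetical vectors v0..v2):
+//   uint8x16x3_t triple = { { v0, v1, v2 } };
+//   vst3q_u8(dst, triple); // writes v0[0], v1[0], v2[0], v0[1], ... to dst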
-// CHECK-LABEL: test_vst3q_u8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
   vst3q_u8(a, b);
 }
-// CHECK-LABEL: test_vst3q_u16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
   vst3q_u16(a, b);
 }
-// CHECK-LABEL: test_vst3q_u32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:
[[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) { vst3q_u32(a, b); } -// CHECK-LABEL: test_vst3q_s8 -// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst3q_s8(int8_t * a, int8x16x3_t b) { vst3q_s8(a, b); } -// CHECK-LABEL: test_vst3q_s16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds 
%struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_s16(int16_t * a, int16x8x3_t b) { vst3q_s16(a, b); } -// CHECK-LABEL: test_vst3q_s32 -// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> 
[[TMP8]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_s32(int32_t * a, int32x4x3_t b) { vst3q_s32(a, b); } -// CHECK-LABEL: test_vst3q_f16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_f16(float16_t * a, float16x8x3_t b) { vst3q_f16(a, b); } -// CHECK-LABEL: test_vst3q_f32 -// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// 
CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st3.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP1]]0, <4 x float> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_f32(float32_t * a, float32x4x3_t b) { vst3q_f32(a, b); } -// CHECK-LABEL: test_vst3q_p8 -// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) { vst3q_p8(a, b); } -// CHECK-LABEL: test_vst3q_p16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca 
%struct.poly16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) { vst3q_p16(a, b); } -// CHECK-LABEL: test_vst3_u8 -// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, 
%struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst3_u8(uint8_t * a, uint8x8x3_t b) { vst3_u8(a, b); } -// CHECK-LABEL: test_vst3_u16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_u16(uint16_t * a, uint16x4x3_t b) { vst3_u16(a, b); } -// CHECK-LABEL: test_vst3_u32 -// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = 
bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP1]]0, <2 x i32> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_u32(uint32_t * a, uint32x2x3_t b) { vst3_u32(a, b); } -// CHECK-LABEL: test_vst3_u64 -// CHECK: vst1.64 +// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP11:%.*]] 
= bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP1]]0, <1 x i64> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_u64(uint64_t * a, uint64x1x3_t b) { vst3_u64(a, b); } -// CHECK-LABEL: test_vst3_s8 -// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst3_s8(int8_t * a, int8x8x3_t b) { vst3_s8(a, b); } -// CHECK-LABEL: test_vst3_s16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, 
align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_s16(int16_t * a, int16x4x3_t b) {
   vst3_s16(a, b);
 }
-// CHECK-LABEL: test_vst3_s32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_s32(int32_t * a, int32x2x3_t b) {
   vst3_s32(a, b);
 }
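+// Note that the 64-bit element case keeps a real three-way store on AArch64:
+// the removed 32-bit ARM checks fell back to vst1.64, while here vst3_s64
+// (and vst3_u64 earlier) lowers to @llvm.aarch64.neon.st3.v1i64 on <1 x i64>
+// operands, so the interleaving is degenerate but the intrinsic is uniform.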
store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP1]]0, <1 x i64> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_s64(int64_t * a, int64x1x3_t b) { vst3_s64(a, b); } -// CHECK-LABEL: test_vst3_f16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 
0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_f16(float16_t * a, float16x4x3_t b) { vst3_f16(a, b); } -// CHECK-LABEL: test_vst3_f32 -// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st3.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP1]]0, <2 x float> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_f32(float32_t * a, float32x2x3_t b) { vst3_f32(a, b); } -// CHECK-LABEL: test_vst3_p8 -// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 
x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst3_p8(poly8_t * a, poly8x8x3_t b) { vst3_p8(a, b); } -// CHECK-LABEL: test_vst3_p16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], 
<4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_p16(poly16_t * a, poly16x4x3_t b) { vst3_p16(a, b); } -// CHECK-LABEL: test_vst3q_lane_u16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) { vst3q_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vst3q_lane_u32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = 
getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) { vst3q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vst3q_lane_s16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// 
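// Illustrative sketch (not part of the test under review): unlike the
// whole-vector vst3q stores, the vst3q_lane variants checked here write a
// single element from each of the three Q-register sources, and the lane index
// is a compile-time immediate, which is why it appears as a constant i64
// operand of @llvm.aarch64.neon.st3lane. Hypothetical helper:
#include <arm_neon.h>
void demo_vst3q_lane_u32(uint32_t *dst, uint32x4x3_t v) {
  // Writes v.val[0][3], v.val[1][3], v.val[2][3] to dst[0..2]; the lane
  // immediate must lie in [0, 3] for four-element vectors.
  vst3q_lane_u32(dst, v, 3);
}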
CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) { vst3q_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vst3q_lane_s32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) { vst3q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vst3q_lane_f16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds 
%struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) { vst3q_lane_f16(a, b, 7); } -// CHECK-LABEL: test_vst3q_lane_f32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: 
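// Note (illustrative, not part of the test): the float16 checks above contain
// no half-precision arithmetic; the <8 x half> values are bitcast to
// <8 x i16> and stored through the same @llvm.aarch64.neon.st3lane.v8i16
// intrinsic as the integer variants. At the C level the call looks the same as
// for any other element type (hypothetical helper):
#include <arm_neon.h>
void demo_vst3q_lane_f16(float16_t *dst, float16x8x3_t v) {
  vst3q_lane_f16(dst, v, 7);  // stores lane 7 of each source vector
}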
[[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP1]]0, <4 x float> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) { vst3q_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vst3q_lane_p16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) { vst3q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vst3_lane_u8 -// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8* +// 
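// Note (illustrative, not part of the test): every CHECK block in these
// vst3/vst4 tests opens with the same alloca + store + memcpy prologue because
// the NEON "xN" aggregates are passed by value: the frontend coerces them to
// [N x <n x T>] array arguments, spills the coerced argument into the [[B]]
// alloca, then copies it into the intrinsic's own __s1 temporary. Since the
// RUN line only applies -mem2reg, that copy survives into the checked IR. The
// aggregate itself is just a struct of vectors, equivalent in shape to:
#include <arm_neon.h>
typedef struct {
  uint8x8_t val[3];  // same layout as uint8x8x3_t from <arm_neon.h>
} demo_uint8x8x3_t;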
CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a) +// CHECK: ret void void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) { vst3_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vst3_lane_u16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void 
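// Illustrative sketch (not part of the test under review): the non-q
// vst3_lane variants operate on 64-bit D-register vectors, so the valid lane
// range shrinks with the wider elements, e.g. [0, 3] for uint16x4_t
// (hypothetical helper):
#include <arm_neon.h>
void demo_vst3_lane_u16(uint16_t *dst, uint16x4x3_t v) {
  vst3_lane_u16(dst, v, 3);  // dst[0..2] = lane 3 of each source vector
}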
test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) { vst3_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vst3_lane_u32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP1]]0, <2 x i32> [[TMP1]]1, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) { vst3_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vst3_lane_s8 -// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], 
align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a) +// CHECK: ret void void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) { vst3_lane_s8(a, b, 7); } -// CHECK-LABEL: test_vst3_lane_s16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) { vst3_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vst3_lane_s32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca 
%struct.int32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP1]]0, <2 x i32> [[TMP1]]1, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) { vst3_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vst3_lane_f16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] 
= load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) { vst3_lane_f16(a, b, 3); } -// CHECK-LABEL: test_vst3_lane_f32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st3lane.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP1]]0, <2 x float> [[TMP1]]1, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) { vst3_lane_f32(a, b, 1); } -// CHECK-LABEL: test_vst3_lane_p8 -// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// 
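// Note (illustrative, not part of the test): the polynomial variants are
// distinct types at the C level, but as the following checks show they lower
// to exactly the same @llvm.aarch64.neon.st3lane.v8i8 / .v4i16 calls as the
// u8/u16 tests above (hypothetical helper):
#include <arm_neon.h>
void demo_vst3_lane_p8(poly8_t *dst, poly8x8x3_t v) {
  vst3_lane_p8(dst, v, 7);
}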
CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a) +// CHECK: ret void void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) { vst3_lane_p8(a, b, 7); } -// CHECK-LABEL: test_vst3_lane_p16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x 
i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) { vst3_lane_p16(a, b, 3); } -// CHECK-LABEL: test_vst4q_u8 -// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]6, align 16 +// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) { vst4q_u8(a, b); } -// CHECK-LABEL: test_vst4q_u16 -// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* 
[[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) { vst4q_u16(a, b); } -// CHECK-LABEL: test_vst4q_u32 -// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, 
align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) { vst4q_u32(a, b); } -// CHECK-LABEL: test_vst4q_s8 -// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]6, align 16 +// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst4q_s8(int8_t * a, int8x16x4_t b) { vst4q_s8(a, b); } -// CHECK-LABEL: test_vst4q_s16 
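// Illustrative sketch (not part of the test under review): vst4q interleaves
// four full Q registers; a common use is repacking planar channel data into
// packed RGBA, as in this hypothetical helper:
#include <arm_neon.h>
void demo_pack_rgba(uint8_t *dst, uint8x16_t r, uint8x16_t g,
                    uint8x16_t b, uint8x16_t a) {
  uint8x16x4_t px = { { r, g, b, a } };
  vst4q_u8(dst, px);  // dst = {r0, g0, b0, a0, r1, g1, b1, a1, ...}
}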
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_s16(int16_t * a, int16x8x4_t b) { vst4q_s16(a, b); } -// CHECK-LABEL: test_vst4q_s32 -// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast 
i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_s32(int32_t * a, int32x4x4_t b) { vst4q_s32(a, b); } -// CHECK-LABEL: test_vst4q_f16 -// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> 
[[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_f16(float16_t * a, float16x8x4_t b) { vst4q_f16(a, b); } -// CHECK-LABEL: test_vst4q_f32 -// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x 
float>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st4.v4f32.p0i8(<4 x float> [[TMP1]]1, <4 x float> [[TMP1]]2, <4 x float> [[TMP1]]3, <4 x float> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_f32(float32_t * a, float32x4x4_t b) { vst4q_f32(a, b); } -// CHECK-LABEL: test_vst4q_p8 -// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]6, align 16 +// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) { vst4q_p8(a, b); } -// CHECK-LABEL: test_vst4q_p16 -// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8* +// CHECK: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) { vst4q_p16(a, b); } -// CHECK-LABEL: test_vst4_u8 -// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, 
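+// Usage sketch (illustrative only; the helper name is hypothetical and this
+// is not part of the checked output): every q-form vst4 above interleaves
+// four 128-bit registers and stores 64 bytes through the pointer argument:
+//   void store_four_planes_u16(uint16_t *dst, uint16x8_t p0, uint16x8_t p1,
+//                              uint16x8_t p2, uint16x8_t p3) {
+//     uint16x8x4_t v = { { p0, p1, p2, p3 } };
+//     vst4q_u16(dst, v); /* dst must have room for 32 uint16_t values */
+//   }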
-// CHECK-LABEL: test_vst4_u8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
  vst4_u8(a, b);
}
-// CHECK-LABEL: test_vst4_u16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
  vst4_u16(a, b);
}
-// CHECK-LABEL: test_vst4_u32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
  vst4_u32(a, b);
}
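+// Memory-layout sketch for the d-form stores above (illustrative only, not
+// a CHECK line): vst4_u32 interleaves the two lanes of its four <2 x i32>
+// sources, so after the call the destination holds
+//   a[0] = b.val[0][0], a[1] = b.val[1][0], a[2] = b.val[2][0],
+//   a[3] = b.val[3][0], a[4] = b.val[0][1], ..., a[7] = b.val[3][1]
+// that is, one element from each source register per group of four.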
-// CHECK-LABEL: test_vst4_u64
-// CHECK: vst1.64
+// CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
  vst4_u64(a, b);
}
-// CHECK-LABEL: test_vst4_s8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst4_s8(int8_t * a, int8x8x4_t b) {
  vst4_s8(a, b);
}
-// CHECK-LABEL: test_vst4_s16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_s16(int16_t * a, int16x4x4_t b) {
  vst4_s16(a, b);
}
-// CHECK-LABEL: test_vst4_s32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_s32(int32_t * a, int32x2x4_t b) {
  vst4_s32(a, b);
}
-// CHECK-LABEL: test_vst4_s64
-// CHECK: vst1.64
+// CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_s64(int64_t * a, int64x1x4_t b) {
  vst4_s64(a, b);
}
-// CHECK-LABEL: test_vst4_f16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_f16(float16_t * a, float16x4x4_t b) {
  vst4_f16(a, b);
}
-// CHECK-LABEL: test_vst4_f32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK: call void @llvm.aarch64.neon.st4.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_f32(float32_t * a, float32x2x4_t b) {
  vst4_f32(a, b);
}
-// CHECK-LABEL: test_vst4_p8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
  vst4_p8(a, b);
}
-// CHECK-LABEL: test_vst4_p16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
  vst4_p16(a, b);
}
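+// The _lane variants that follow store a single element from each of the
+// four registers rather than whole vectors. The lane index is a compile-time
+// constant in [0, lanes-1]; a hypothetical caller (illustration only):
+//   void store_last_lane_u16(uint16_t *dst, uint16x8x4_t v) {
+//     vst4q_lane_u16(dst, v, 7); /* writes v.val[0][7]..v.val[3][7], 8 bytes */
+//   }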
-// CHECK-LABEL: test_vst4q_lane_u16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
  vst4q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: test_vst4q_lane_u32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
  vst4q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: test_vst4q_lane_s16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
  vst4q_lane_s16(a, b, 7);
}
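+// Note on the checked IR (descriptive of the lines above): the lane index
+// reaches @llvm.aarch64.neon.st4lane.* as a trailing i64 constant operand
+// (7 for the 8-lane types, 3 for the 4-lane types), placed after the four
+// data vectors and before the i8* destination pointer.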
@test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) { vst4q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vst4q_lane_f16 -// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = 
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
  vst4q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: test_vst4q_lane_f32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x float>] %b.coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
  vst4q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: test_vst4q_lane_p16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
  vst4q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: test_vst4_lane_u8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK: ret void
void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
  vst4_lane_u8(a, b, 7);
}
-// CHECK-LABEL: test_vst4_lane_u16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
  vst4_lane_u16(a, b, 3);
}
-// CHECK-LABEL: test_vst4_lane_u32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] %b.coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
  vst4_lane_u32(a, b, 1);
}
-// CHECK-LABEL: test_vst4_lane_s8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK: ret void
void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
  vst4_lane_s8(a, b, 7);
}
-// CHECK-LABEL: test_vst4_lane_s16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
  vst4_lane_s16(a, b, 3);
}
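// Illustrative sketch (an editorial aid, not part of the checked test): per
// the IR above, vst4_lane stores lane i of each of the four source vectors
// to four consecutive elements at the pointer. A scalar model, using the
// hypothetical helper name sketch_vst4_lane_s16; the real intrinsic also
// requires the lane to be a compile-time constant:
static inline void sketch_vst4_lane_s16(int16_t *p, int16x4x4_t v, int lane) {
  p[0] = v.val[0][lane]; // element from vector 0
  p[1] = v.val[1][lane]; // element from vector 1
  p[2] = v.val[2][lane]; // element from vector 2
  p[3] = v.val[3][lane]; // element from vector 3
}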
-// CHECK-LABEL: test_vst4_lane_s32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] %b.coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
  vst4_lane_s32(a, b, 1);
}
-// CHECK-LABEL: test_vst4_lane_f16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x half>] %b.coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
  vst4_lane_f16(a, b, 3);
}
-// CHECK-LABEL: test_vst4_lane_f32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x float>] %b.coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
  vst4_lane_f32(a, b, 1);
}
-// CHECK-LABEL: test_vst4_lane_p8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK: ret void
void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
  vst4_lane_p8(a, b, 7);
}
-// CHECK-LABEL: test_vst4_lane_p16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) {
  vst4_lane_p16(a, b, 3);
}
-// CHECK-LABEL: test_vsub_s8
-// CHECK: vsub.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
+// CHECK: ret <8 x i8> [[SUB_I]]
int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) {
  return vsub_s8(a, b);
}
-// CHECK-LABEL: test_vsub_s16
-// CHECK: vsub.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
+// CHECK: ret <4 x i16> [[SUB_I]]
int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) {
  return vsub_s16(a, b);
}
-// CHECK-LABEL: test_vsub_s32
-// CHECK: vsub.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
+// CHECK: ret <2 x i32> [[SUB_I]]
int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) {
  return vsub_s32(a, b);
}
-// CHECK-LABEL: test_vsub_s64
-// CHECK: vsub.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
+// CHECK: ret <1 x i64> [[SUB_I]]
int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) {
  return vsub_s64(a, b);
}
-// CHECK-LABEL: test_vsub_f32
-// CHECK: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, %b
+// CHECK: ret <2 x float> [[SUB_I]]
float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) {
  return vsub_f32(a, b);
}
-// CHECK-LABEL: test_vsub_u8
-// CHECK: vsub.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
+// CHECK: ret <8 x i8> [[SUB_I]]
uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) {
  return vsub_u8(a, b);
}
-// CHECK-LABEL: test_vsub_u16
-// CHECK: vsub.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
+// CHECK: ret <4 x i16> [[SUB_I]]
uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) {
  return vsub_u16(a, b);
}
-// CHECK-LABEL: test_vsub_u32
-// CHECK: vsub.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
+// CHECK: ret <2 x i32> [[SUB_I]]
uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) {
  return vsub_u32(a, b);
}
-// CHECK-LABEL: test_vsub_u64
-// CHECK: vsub.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
+// CHECK: ret <1 x i64> [[SUB_I]]
uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) {
  return vsub_u64(a, b);
}
-// CHECK-LABEL: test_vsubq_s8
-// CHECK: vsub.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
+// CHECK: ret <16 x i8> [[SUB_I]]
int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) {
  return vsubq_s8(a, b);
}
-// CHECK-LABEL: test_vsubq_s16
-// CHECK: vsub.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) {
  return vsubq_s16(a, b);
}
-// CHECK-LABEL: test_vsubq_s32
-// CHECK: vsub.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}
-// CHECK-LABEL: test_vsubq_s64
-// CHECK: vsub.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK: ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) {
  return vsubq_s64(a, b);
}
-// CHECK-LABEL: test_vsubq_f32
-// CHECK: vsub.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, %b
+// CHECK: ret <4 x float> [[SUB_I]]
float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) {
  return vsubq_f32(a, b);
}
-// CHECK-LABEL: test_vsubq_u8
-// CHECK: vsub.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
+// CHECK: ret <16 x i8> [[SUB_I]]
uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vsubq_u8(a, b);
}
-// CHECK-LABEL: test_vsubq_u16
-// CHECK: vsub.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vsubq_u16(a, b);
}
-// CHECK-LABEL: test_vsubq_u32
-// CHECK: vsub.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vsubq_u32(a, b);
}
-// CHECK-LABEL: test_vsubq_u64
-// CHECK: vsub.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK: ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) {
  return vsubq_u64(a, b);
}
-// CHECK-LABEL: test_vsubhn_s16
-// CHECK: vsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VSUBHN2_I]]
int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
  return vsubhn_s16(a, b);
}
-// CHECK-LABEL: test_vsubhn_s32
-// CHECK: vsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VSUBHN2_I]]
int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
  return vsubhn_s32(a, b);
}
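// Illustrative sketch (an editorial aid, not a CHECK'd part of this test):
// the vsubhn_* pattern checked here is sub, then lshr by half the element
// width, then trunc, i.e. the high half of the wide difference. A per-lane
// model for the s16 case, using a hypothetical helper name:
static inline int8_t sketch_vsubhn_lane_s16(int16_t x, int16_t y) {
  return (int8_t)((uint16_t)(x - y) >> 8); // keep the high byte of the difference
}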
-// CHECK-LABEL: test_vsubhn_s64
-// CHECK: vsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VSUBHN2_I]]
int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
  return vsubhn_s64(a, b);
}
-// CHECK-LABEL: test_vsubhn_u16
-// CHECK: vsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VSUBHN2_I]]
uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vsubhn_u16(a, b);
}
-// CHECK-LABEL: test_vsubhn_u32
-// CHECK: vsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VSUBHN2_I]]
uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vsubhn_u32(a, b);
}
-// CHECK-LABEL: test_vsubhn_u64
-// CHECK: vsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VSUBHN2_I]]
uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vsubhn_u64(a, b);
}
-// CHECK-LABEL: test_vsubl_s8
-// CHECK: vsubl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
  return vsubl_s8(a, b);
}
-// CHECK-LABEL: test_vsubl_s16
-// CHECK: vsubl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
  return vsubl_s16(a, b);
}
-// CHECK-LABEL: test_vsubl_s32
-// CHECK: vsubl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
  return vsubl_s32(a, b);
}
-// CHECK-LABEL: test_vsubl_u8
-// CHECK: vsubl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
  return vsubl_u8(a, b);
}
-// CHECK-LABEL: test_vsubl_u16
-// CHECK: vsubl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
  return vsubl_u16(a, b);
}
-// CHECK-LABEL: test_vsubl_u32
-// CHECK: vsubl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
  return vsubl_u32(a, b);
}
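// Illustrative sketch (an editorial aid, not a CHECK'd part of this test):
// vsubl_* widens both inputs (sext for signed, zext for unsigned, as the IR
// above checks) before subtracting, so the wide difference cannot wrap.
// Per-lane model for the s16 case, using a hypothetical helper name:
static inline int32_t sketch_vsubl_lane_s16(int16_t x, int16_t y) {
  return (int32_t)x - (int32_t)y; // exact 32-bit difference
}
// The vsubw_* tests that follow differ only in that the first operand is
// already wide, so only the second operand is extended.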
-// CHECK-LABEL: test_vsubw_s8
-// CHECK: vsubw.s8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
  return vsubw_s8(a, b);
}
-// CHECK-LABEL: test_vsubw_s16
-// CHECK: vsubw.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
  return vsubw_s16(a, b);
}
-// CHECK-LABEL: test_vsubw_s32
-// CHECK: vsubw.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK: ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
  return vsubw_s32(a, b);
}
-// CHECK-LABEL: test_vsubw_u8
-// CHECK: vsubw.u8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
  return vsubw_u8(a, b);
}
-// CHECK-LABEL: test_vsubw_u16
-// CHECK: vsubw.u16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
  return vsubw_u16(a, b);
}
-// CHECK-LABEL: test_vsubw_u32
-// CHECK: vsubw.u32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK: ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
  return vsubw_u32(a, b);
}
-// CHECK-LABEL: test_vtbl1_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL11_I]]
uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
  return vtbl1_u8(a, b);
}
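// Illustrative sketch (an editorial aid, not a CHECK'd part of this test):
// vtbl1 is a byte table lookup; the IR widens the 8-byte table to 16 bytes
// (padded with zeroes) before calling the tbl1 intrinsic, so out-of-range
// indices read 0. Per-lane model, using a hypothetical helper name:
static inline uint8_t sketch_vtbl1_lane_u8(uint8x8_t table, uint8_t idx) {
  return idx < 8 ? table[idx] : 0; // in-range lanes come from the table
}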
-// CHECK-LABEL: test_vtbl1_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL11_I]]
int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
  return vtbl1_s8(a, b);
}
-// CHECK-LABEL: test_vtbl1_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL11_I]]
poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
  return vtbl1_p8(a, b);
}
-// CHECK-LABEL: test_vtbl2_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %a.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL13_I]]
uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
  return vtbl2_u8(a, b);
}
-// CHECK-LABEL: test_vtbl2_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %a.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL13_I]]
int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
  return vtbl2_s8(a, b);
}
-// CHECK-LABEL: test_vtbl2_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %a.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL13_I]]
poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
  return vtbl2_p8(a, b);
}
-// CHECK-LABEL: test_vtbl3_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %a.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VTBL26_I]] uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) { return vtbl3_u8(a, b); } -// CHECK-LABEL: test_vtbl3_s8 -// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] %a.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #4 +// CHECK:
ret <8 x i8> [[VTBL26_I]] int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) { return vtbl3_s8(a, b); } -// CHECK-LABEL: test_vtbl3_p8 -// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VTBL26_I]] poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) { return vtbl3_p8(a, b); } -// CHECK-LABEL: test_vtbl4_u8 -// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds 
[4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VTBL28_I]] uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) { return vtbl4_u8(a, b); } -// CHECK-LABEL: test_vtbl4_s8 -// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 
x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VTBL28_I]] int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) { return vtbl4_s8(a, b); } -// CHECK-LABEL: test_vtbl4_p8 -// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VTBL28_I]] poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) { return vtbl4_p8(a, b); } -// CHECK-LABEL: test_vtbx1_u8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) 
#4 +// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8> +// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a +// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK: ret <8 x i8> [[VTBX_I]] uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vtbx1_u8(a, b, c); } -// CHECK-LABEL: test_vtbx1_s8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #4 +// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8> +// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a +// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK: ret <8 x i8> [[VTBX_I]] int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vtbx1_s8(a, b, c); } -// CHECK-LABEL: test_vtbx1_p8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #4 +// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8> +// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a +// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK: ret <8 x i8> [[VTBX_I]] poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) { return vtbx1_p8(a, b, c); } -// CHECK-LABEL: test_vtbx2_u8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] =
getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX13_I]] uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) { return vtbx2_u8(a, b, c); } -// CHECK-LABEL: test_vtbx2_s8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX13_I]] int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) { return vtbx2_s8(a, b, c); } -// CHECK-LABEL: test_vtbx2_p8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x 
i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX13_I]] poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) { return vtbx2_p8(a, b, c); } -// CHECK-LABEL: test_vtbx3_u8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #4 +// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a +// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], +// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]] +// CHECK: ret <8 x i8> [[VTBX_I]] uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) { return vtbx3_u8(a, b, c); } -// CHECK-LABEL: test_vtbx3_s8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// 
CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #4 +// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a +// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], +// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]] +// CHECK: ret <8 x i8> [[VTBX_I]] int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) { return vtbx3_s8(a, b, c); } -// CHECK-LABEL: test_vtbx3_p8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds 
%struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #4 +// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a +// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], +// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]] +// CHECK: ret <8 x i8> [[VTBX_I]] poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) { return vtbx3_p8(a, b, c); } -// CHECK-LABEL: test_vtbx4_u8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 
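+// NOTE: unlike vtbx1/vtbx3 above, vtbx4 needs no compare-and-select
+// emulation: the four 8-byte table halves are concatenated pairwise into
+// two 16-byte vectors (the two shufflevectors checked below), giving a
+// full 32-entry table, and @llvm.aarch64.neon.tbx2 itself preserves the
+// lanes of %a whose index in %c is out of range.
+// Illustrative semantics (a hedged sketch, not a CHECK directive; 'table'
+// stands for the 32 concatenated bytes of %b):
+//   for (int i = 0; i != 8; ++i)
+//     dst[i] = (c[i] < 32) ? table[c[i]] : a[i];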
+// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8 +// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX28_I]] uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) { return vtbx4_u8(a, b, c); } -// CHECK-LABEL: test_vtbx4_s8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8 +// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX28_I]] int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) { return vtbx4_s8(a, b, c); } -// CHECK-LABEL: test_vtbx4_p8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// 
CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8 +// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX28_I]] poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) { return vtbx4_p8(a, b, c); } -// CHECK-LABEL: test_vtrn_s8 -// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP8]] int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) { return vtrn_s8(a, b); } -// CHECK-LABEL: test_vtrn_s16 -// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6> +// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP12]] int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) { return vtrn_s16(a, b); } -// CHECK-LABEL: test_vtrn_s32 -// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr
inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP12]] int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) { return vtrn_s32(a, b); } -// CHECK-LABEL: test_vtrn_u8 -// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP8]] uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) { return vtrn_u8(a, b); } -// CHECK-LABEL: test_vtrn_u16 -// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6> +//
CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP1]]2 uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) { return vtrn_u16(a, b); } -// CHECK-LABEL: test_vtrn_u32 -// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP1]]1, [2 x <2 x i32>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP1]]2 uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) { return vtrn_u32(a, b); } -// CHECK-LABEL: test_vtrn_f32 -// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast 
%struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x float>] [[TMP1]]1, [2 x <2 x float>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP1]]2 float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) { return vtrn_f32(a, b); } -// CHECK-LABEL: test_vtrn_p8 -// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) { return vtrn_p8(a, b); } -// CHECK-LABEL: test_vtrn_p16 -// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// 
CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP1]]2 poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) { return vtrn_p16(a, b); } -// CHECK-LABEL: test_vtrnq_s8 -// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { return vtrnq_s8(a, b); } -// CHECK-LABEL: test_vtrnq_s16 -// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define 
-// CHECK-LABEL: test_vtrnq_s8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x2_t [[TMP8]]
int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
  return vtrnq_s8(a, b);
}
-// CHECK-LABEL: test_vtrnq_s16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x2_t [[TMP12]]
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
  return vtrnq_s16(a, b);
}
-// CHECK-LABEL: test_vtrnq_s32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x2_t [[TMP12]]
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
  return vtrnq_s32(a, b);
}
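+// Each of these two-result intrinsics returns a struct wrapping [2 x <vector>].
+// Before optimization the IR follows the pattern checked above: both shuffle
+// results are stored back to back through the __ret alloca, the aggregate is
+// copied with llvm.memcpy, and the struct is reloaded for the return. A rough
+// C model of that data flow (names are stand-ins, not part of the test):
+//
+//   int32x4x2_t __ret;
+//   __ret.val[0] = trn1_half;   // first shufflevector result
+//   __ret.val[1] = trn2_half;   // second shufflevector result
+//   return __ret;               // returned as one 32-byte aggregate copy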
-// CHECK-LABEL: test_vtrnq_u8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
  return vtrnq_u8(a, b);
}
-// CHECK-LABEL: test_vtrnq_u16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP12]]
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
  return vtrnq_u16(a, b);
}
-// CHECK-LABEL: test_vtrnq_u32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP12]]
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
  return vtrnq_u32(a, b);
}
-// CHECK-LABEL: test_vtrnq_f32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK: store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x2_t [[TMP12]]
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
  return vtrnq_f32(a, b);
}
-// CHECK-LABEL: test_vtrnq_p8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
  return vtrnq_p8(a, b);
}
-// CHECK-LABEL: test_vtrnq_p16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x2_t [[TMP12]]
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
  return vtrnq_p16(a, b);
}
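+// The vtst_*/vtstq_* checks below share a single pattern: AND the inputs,
+// compare each lane to zero with icmp ne, then sign-extend the i1 mask back
+// to the lane width. A scalar model of one lane (illustrative only, not part
+// of the test):
+//
+//   static inline int8_t vtst_lane_model(int8_t a, int8_t b) {
+//     return (a & b) != 0 ? (int8_t)-1 : (int8_t)0; // all-ones if any bit in common
+//   }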
-// CHECK-LABEL: test_vtst_s8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
  return vtst_s8(a, b);
}
-// CHECK-LABEL: test_vtst_s16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
  return vtst_s16(a, b);
}
-// CHECK-LABEL: test_vtst_s32
-// CHECK: vtst.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
  return vtst_s32(a, b);
}
-// CHECK-LABEL: test_vtst_u8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
  return vtst_u8(a, b);
}
-// CHECK-LABEL: test_vtst_u16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
  return vtst_u16(a, b);
}
-// CHECK-LABEL: test_vtst_u32
-// CHECK: vtst.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
  return vtst_u32(a, b);
}
-// CHECK-LABEL: test_vtst_p8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
  return vtst_p8(a, b);
}
-// CHECK-LABEL: test_vtst_p16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
  return vtst_p16(a, b);
}
-// CHECK-LABEL: test_vtstq_s8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK: ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
  return vtstq_s8(a, b);
}
-// CHECK-LABEL: test_vtstq_s16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
  return vtstq_s16(a, b);
}
-// CHECK-LABEL: test_vtstq_s32
-// CHECK: vtst.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK: ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
  return vtstq_s32(a, b);
}
-// CHECK-LABEL: test_vtstq_u8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK: ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
  return vtstq_u8(a, b);
}
-// CHECK-LABEL: test_vtstq_u16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
  return vtstq_u16(a, b);
}
-// CHECK-LABEL: test_vtstq_u32
-// CHECK: vtst.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK: ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
  return vtstq_u32(a, b);
}
-// CHECK-LABEL: test_vtstq_p8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK: ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
  return vtstq_p8(a, b);
}
-// CHECK-LABEL: test_vtstq_p16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
  return vtstq_p16(a, b);
}
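+// The vuzp checks below reuse the two-shufflevector structure of vtrn; only
+// the lane masks differ. The expected UZP1/UZP2 masks take the even-indexed
+// and then the odd-indexed elements of the concatenated inputs, e.g. for the
+// 8-lane case <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> and
+// <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>.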
-// CHECK-LABEL: test_vuzp_s8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP8]]
int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
  return vuzp_s8(a, b);
}
-// CHECK-LABEL: test_vuzp_s16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP12]]
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
  return vuzp_s16(a, b);
}
-// CHECK-LABEL: test_vuzp_s32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
+// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP12]]
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
  return vuzp_s32(a, b);
}
-// CHECK-LABEL: test_vuzp_u8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
  return vuzp_u8(a, b);
}
-// CHECK-LABEL: test_vuzp_u16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP12]]
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
  return vuzp_u16(a, b);
}
-// CHECK-LABEL: test_vuzp_u32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0
+// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP12]]
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
  return vuzp_u32(a, b);
}
-// CHECK-LABEL: test_vuzp_f32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32>
+// CHECK: store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32>
+// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0
+// CHECK: store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP12]]
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
  return vuzp_f32(a, b);
}
-// CHECK-LABEL: test_vuzp_p8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
  return vuzp_p8(a, b);
}
-// CHECK-LABEL: test_vuzp_p16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP12]]
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
  return vuzp_p16(a, b);
}
-// CHECK-LABEL: test_vuzpq_s8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x2_t [[TMP8]]
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
  return vuzpq_s8(a, b);
}
-// CHECK-LABEL: test_vuzpq_s16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x2_t [[TMP12]]
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
  return vuzpq_s16(a, b);
}
-// CHECK-LABEL: test_vuzpq_s32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x2_t [[TMP12]]
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
  return vuzpq_s32(a, b);
}
-// CHECK-LABEL: test_vuzpq_u8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
  return vuzpq_u8(a, b);
}
-// CHECK-LABEL: test_vuzpq_u16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP12]]
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
  return vuzpq_u16(a, b);
}
-// CHECK-LABEL: test_vuzpq_u32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP12]]
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
  return vuzpq_u32(a, b);
}
-// CHECK-LABEL: test_vuzpq_f32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK: store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x2_t [[TMP12]]
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
  return vuzpq_f32(a, b);
}
-// CHECK-LABEL: test_vuzpq_p8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
  return vuzpq_p8(a, b);
}
-// CHECK-LABEL: test_vuzpq_p16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x2_t [[TMP12]]
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
  return vuzpq_p16(a, b);
}
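+// The vzip checks below again differ from vtrn/vuzp only in the lane masks:
+// the expected ZIP1/ZIP2 masks interleave first the low and then the high
+// halves of the two inputs, e.g. for the 8-lane case
+// <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> and
+// <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>.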
-// CHECK-LABEL: test_vzip_s8
-// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP8]]
int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
  return vzip_s8(a, b);
}
-// CHECK-LABEL: test_vzip_s16
-// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP12]]
int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
  return vzip_s16(a, b);
}
-// CHECK-LABEL: test_vzip_s32
-// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
+// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP12]]
int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
  return vzip_s32(a, b);
}
-// CHECK-LABEL: test_vzip_u8
-// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
  return vzip_u8(a, b);
}
-// CHECK-LABEL: test_vzip_u16
-// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 
x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP1]]2 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) { return vzip_u16(a, b); } -// CHECK-LABEL: test_vzip_u32 -// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP1]]1, [2 x <2 x i32>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP1]]2 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) { return vzip_u32(a, b); } -// CHECK-LABEL: test_vzip_f32 -// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: 
define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x float>] [[TMP1]]1, [2 x <2 x float>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP1]]2 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) { return vzip_f32(a, b); } -// CHECK-LABEL: test_vzip_p8 -// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] 
poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) { return vzip_p8(a, b); } -// CHECK-LABEL: test_vzip_p16 -// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP1]]2 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) { return vzip_p16(a, b); } -// CHECK-LABEL: test_vzipq_s8 -// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// 
CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) { return vzipq_s8(a, b); } -// CHECK-LABEL: test_vzipq_s16 -// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP1]]2 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { return vzipq_s16(a, b); } -// CHECK-LABEL: test_vzipq_s32 -// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* 
[[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP1]]2 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) { return vzipq_s32(a, b); } -// CHECK-LABEL: test_vzipq_u8 -// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP8]] uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) { return vzipq_u8(a, b); } -// CHECK-LABEL: test_vzipq_u16 -// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast 
%struct.uint16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP1]]2 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) { return vzipq_u16(a, b); } -// CHECK-LABEL: test_vzipq_u32 -// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP1]]2 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) { return vzipq_u32(a, b); } -// CHECK-LABEL: test_vzipq_f32 -// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x float>] [[TMP1]]1, [2 x <4 x float>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP1]]2 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) { return vzipq_f32(a, b); } -// CHECK-LABEL: test_vzipq_p8 -// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP8]] poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) { return vzipq_p8(a, b); } -// CHECK-LABEL: test_vzipq_p16 -// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x 
i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP1]]2 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) { return vzipq_p16(a, b); } Index: test/CodeGen/builtins-arm-exclusive.c =================================================================== --- test/CodeGen/builtins-arm-exclusive.c +++ test/CodeGen/builtins-arm-exclusive.c @@ -1,32 +1,6 @@ -// REQUIRES: arm-registered-target -// RUN: %clang_cc1 -Wall -Werror -triple thumbv8-linux-gnueabi -fno-signed-char -O3 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -O3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARM64 - -// Make sure the canonical use works before going into smaller details: -int atomic_inc(int *addr) { - int Failure, OldVal; - do { - OldVal = __builtin_arm_ldrex(addr); - Failure = __builtin_arm_strex(OldVal + 1, addr); - } while (Failure); - - return OldVal; -} +// RUN: %clang_cc1 -Wall -Werror -triple thumbv8-linux-gnueabi -fno-signed-char -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s --check-prefix=CHECK-ARM64 -// CHECK-LABEL: @atomic_inc -// CHECK: [[OLDVAL:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* %addr) -// CHECK: [[INC:%.*]] = add nsw i32 [[OLDVAL]], 1 -// CHECK: [[FAILURE:%.*]] = tail call i32 @llvm.arm.strex.p0i32(i32 [[INC]], i32* %addr) -// CHECK: [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0 -// CHECK: br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}} - -// CHECK-ARM64-LABEL: @atomic_inc -// CHECK-ARM64: [[OLDVAL:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr) -// CHECK-ARM64: [[INC:%.*]] = add i64 [[OLDVAL]], 1 -// CHECK-ARM64: [[TRUNC:%.*]] = and i64 [[INC]], 4294967295 -// CHECK-ARM64: [[FAILURE:%.*]] = tail call i32 @llvm.aarch64.stxr.p0i32(i64 [[TRUNC]], i32* %addr) -// CHECK-ARM64: [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0 -// CHECK-ARM64: br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}} struct Simple { char a, b; @@ -37,36 +11,33 @@ // CHECK-ARM64-LABEL: @test_ldrex int sum = 0; sum += __builtin_arm_ldrex(addr); -// CHECK: 
[[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i8(i8* %addr) -// CHECK: and i32 [[INTRES]], 255 +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %addr) +// CHECK: trunc i32 [[INTRES]] to i8 -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr) -// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 -// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24 -// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24 +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr) +// CHECK-ARM64: trunc i64 [[INTRES]] to i8 sum += __builtin_arm_ldrex((short *)addr); // CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16* -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]]) -// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16 -// CHECK: ashr exact i32 [[TMPSEXT]], 16 +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]]) +// CHECK: trunc i32 [[INTRES]] to i16 // CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i16(i16* [[ADDR16]]) -// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 -// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16 -// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16 +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i16(i16* [[ADDR16]]) +// CHECK-ARM64: trunc i64 [[INTRES]] to i16 sum += __builtin_arm_ldrex((int *)addr); // CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32* -// CHECK: call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]]) +// CHECK: call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]]) // CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* [[ADDR32]]) +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* [[ADDR32]]) // CHECK-ARM64: trunc i64 [[INTRES]] to i32 sum += __builtin_arm_ldrex((long long *)addr); -// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* %addr) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64* +// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8* +// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* [[TMP5]]) // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64* // CHECK-ARM64: call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]]) @@ -79,16 +50,18 @@ sum += __builtin_arm_ldrex(addrfloat); // CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32* -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]]) +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]]) // CHECK: bitcast i32 [[INTRES]] to float // CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* [[INTADDR]]) +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* [[INTADDR]]) // CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 // CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float sum += __builtin_arm_ldrex((double *)addr); -// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldrexd(i8* %addr) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8* +// CHECK: [[STRUCTRES:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[TMP5]]) // CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1 // CHECK: [[RESLO:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 0 // CHECK: [[RESHI64:%.*]] = zext i32 [[RESHI]] to i64 @@ -97,21 +70,31 @@ // CHECK: [[INTRES:%.*]] = or i64 
[[RESHIHI]], [[RESLO64]] // CHECK: bitcast i64 [[INTRES]] to double -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: bitcast i64 [[INTRES]] to double sum += *__builtin_arm_ldrex((int **)addr); -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i32** +// CHECK: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i32* +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[TMP5]]) // CHECK: inttoptr i32 [[INTRES]] to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to i32** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: inttoptr i64 [[INTRES]] to i32* sum += __builtin_arm_ldrex((struct Simple **)addr)->a; -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32* +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[TMP5]]) // CHECK: inttoptr i32 [[INTRES]] to %struct.Simple* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple* return sum; } @@ -121,36 +104,33 @@ // CHECK-ARM64-LABEL: @test_ldaex int sum = 0; sum += __builtin_arm_ldaex(addr); -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i8(i8* %addr) -// CHECK: and i32 [[INTRES]], 255 +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i8(i8* %addr) +// CHECK: trunc i32 [[INTRES]] to i8 -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr) -// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 -// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24 -// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24 +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr) +// CHECK-ARM64: trunc i64 [[INTRES]] to i8 sum += __builtin_arm_ldaex((short *)addr); // CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16* -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i16(i16* [[ADDR16]]) -// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16 -// CHECK: ashr exact i32 [[TMPSEXT]], 16 +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i16(i16* [[ADDR16]]) +// CHECK: trunc i32 [[INTRES]] to i16 // CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[ADDR16]]) -// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 -// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16 -// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16 +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[ADDR16]]) +// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i16 sum += __builtin_arm_ldaex((int *)addr); // CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32* // CHECK: 
call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]]) // CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[ADDR32]]) +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[ADDR32]]) // CHECK-ARM64: trunc i64 [[INTRES]] to i32 sum += __builtin_arm_ldaex((long long *)addr); -// CHECK: call { i32, i32 } @llvm.arm.ldaexd(i8* %addr) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64* +// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8* +// CHECK: call { i32, i32 } @llvm.arm.ldaexd(i8* [[TMP5]]) // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64* // CHECK-ARM64: call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]]) @@ -163,16 +143,18 @@ sum += __builtin_arm_ldaex(addrfloat); // CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32* -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[INTADDR]]) +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[INTADDR]]) // CHECK: bitcast i32 [[INTRES]] to float // CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[INTADDR]]) +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[INTADDR]]) // CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 // CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float sum += __builtin_arm_ldaex((double *)addr); -// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldaexd(i8* %addr) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8* +// CHECK: [[STRUCTRES:%.*]] = call { i32, i32 } @llvm.arm.ldaexd(i8* [[TMP5]]) // CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1 // CHECK: [[RESLO:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 0 // CHECK: [[RESHI64:%.*]] = zext i32 [[RESHI]] to i64 @@ -181,21 +163,31 @@ // CHECK: [[INTRES:%.*]] = or i64 [[RESHIHI]], [[RESLO64]] // CHECK: bitcast i64 [[INTRES]] to double -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: bitcast i64 [[INTRES]] to double sum += *__builtin_arm_ldaex((int **)addr); -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i32** +// CHECK: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i32* +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[TMP5]]) // CHECK: inttoptr i32 [[INTRES]] to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to i32** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: inttoptr i64 [[INTRES]] to i32* sum += __builtin_arm_ldaex((struct Simple **)addr)->a; -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32* +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[TMP5]]) // CHECK: inttoptr i32 [[INTRES]] to %struct.Simple* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 
@llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple* return sum; } @@ -225,27 +217,51 @@ // CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 42, i32* [[ADDR32]]) res |= __builtin_arm_strex(42, (long long *)addr); -// CHECK: call i32 @llvm.arm.strexd(i32 42, i32 0, i8* %addr) +// CHECK: store i64 42, i64* [[TMP:%.*]], align 8 +// CHECK: [[LOHI_ADDR:%.*]] = bitcast i64* [[TMP]] to { i32, i32 }* +// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]] +// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0 +// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64* +// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8* +// CHECK: call i32 @llvm.arm.strexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]]) // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64* // CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 42, i64* [[ADDR64]]) res |= __builtin_arm_strex(2.71828f, (float *)addr); -// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to float* +// CHECK: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32* +// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[TMP5]]) -// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 1076754509, i32* [[ADDR32]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to float* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32* +// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 1076754509, i32* [[TMP5]]) res |= __builtin_arm_strex(3.14159, (double *)addr); -// CHECK: call i32 @llvm.arm.strexd(i32 -266631570, i32 1074340345, i8* %addr) - -// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]]) +// CHECK: store double 3.141590e+00, double* [[TMP:%.*]], align 8 +// CHECK: [[LOHI_ADDR:%.*]] = bitcast double* [[TMP]] to { i32, i32 }* +// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]] +// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0 +// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8* +// CHECK: call i32 @llvm.arm.strexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]]) + +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64* +// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 4614256650576692846, i64* [[TMP5]]) res |= __builtin_arm_strex(&var, (struct Simple **)addr); -// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32 -// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32* +// CHECK: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i32 +// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[TMP5]]) -// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64 -// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64* +// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint 
%struct.Simple* %var to i64 +// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 [[INTVAL]], i64* [[TMP5]]) return res; } @@ -275,27 +291,51 @@ // CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 42, i32* [[ADDR32]]) res |= __builtin_arm_stlex(42, (long long *)addr); -// CHECK: call i32 @llvm.arm.stlexd(i32 42, i32 0, i8* %addr) +// CHECK: store i64 42, i64* [[TMP:%.*]], align 8 +// CHECK: [[LOHI_ADDR:%.*]] = bitcast i64* [[TMP]] to { i32, i32 }* +// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]] +// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0 +// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64* +// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8* +// CHECK: call i32 @llvm.arm.stlexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]]) // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64* // CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 42, i64* [[ADDR64]]) res |= __builtin_arm_stlex(2.71828f, (float *)addr); -// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 1076754509, i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to float* +// CHECK: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32* +// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 1076754509, i32* [[TMP5]]) -// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 1076754509, i32* [[ADDR32]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to float* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32* +// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 1076754509, i32* [[TMP5]]) res |= __builtin_arm_stlex(3.14159, (double *)addr); -// CHECK: call i32 @llvm.arm.stlexd(i32 -266631570, i32 1074340345, i8* %addr) - -// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]]) +// CHECK: store double 3.141590e+00, double* [[TMP:%.*]], align 8 +// CHECK: [[LOHI_ADDR:%.*]] = bitcast double* [[TMP]] to { i32, i32 }* +// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]] +// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0 +// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8* +// CHECK: call i32 @llvm.arm.stlexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]]) + +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64* +// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 4614256650576692846, i64* [[TMP5]]) res |= __builtin_arm_stlex(&var, (struct Simple **)addr); -// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32 -// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32* +// CHECK: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i32 +// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 [[INTVAL]], i32* [[TMP5]]) -// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64 -// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64* +// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i64 +// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 [[INTVAL]], i64* [[TMP5]]) return res; } @@ -317,7 +357,7 @@ return __builtin_arm_ldrex(addr); // CHECK-ARM64: [[ADDR8:%.*]] = 
bitcast i128* %addr to i8* -// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.aarch64.ldxp(i8* [[ADDR8]]) +// CHECK-ARM64: [[STRUCTRES:%.*]] = call { i64, i64 } @llvm.aarch64.ldxp(i8* [[ADDR8]]) // CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1 // CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0 // CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128 @@ -331,11 +371,13 @@ // CHECK-ARM64-LABEL: @test_strex_128 return __builtin_arm_strex(val, addr); -// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64 -// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64 -// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64 +// CHECK-ARM64: store i128 %val, i128* [[TMP:%.*]], align 16 +// CHECK-ARM64: [[LOHI_ADDR:%.*]] = bitcast i128* [[TMP]] to { i64, i64 }* +// CHECK-ARM64: [[LOHI:%.*]] = load { i64, i64 }, { i64, i64 }* [[LOHI_ADDR]] +// CHECK-ARM64: [[LO:%.*]] = extractvalue { i64, i64 } [[LOHI]], 0 +// CHECK-ARM64: [[HI:%.*]] = extractvalue { i64, i64 } [[LOHI]], 1 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8* -// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.aarch64.stxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]]) +// CHECK-ARM64: call i32 @llvm.aarch64.stxp(i64 [[LO]], i64 [[HI]], i8* [[ADDR8]]) } __int128 test_ldaex_128(__int128 *addr) { @@ -343,7 +385,7 @@ return __builtin_arm_ldaex(addr); // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8* -// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.aarch64.ldaxp(i8* [[ADDR8]]) +// CHECK-ARM64: [[STRUCTRES:%.*]] = call { i64, i64 } @llvm.aarch64.ldaxp(i8* [[ADDR8]]) // CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1 // CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0 // CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128 @@ -357,11 +399,13 @@ // CHECK-ARM64-LABEL: @test_stlex_128 return __builtin_arm_stlex(val, addr); -// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64 -// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64 -// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64 +// CHECK-ARM64: store i128 %val, i128* [[TMP:%.*]], align 16 +// CHECK-ARM64: [[LOHI_ADDR:%.*]] = bitcast i128* [[TMP]] to { i64, i64 }* +// CHECK-ARM64: [[LOHI:%.*]] = load { i64, i64 }, { i64, i64 }* [[LOHI_ADDR]] +// CHECK-ARM64: [[LO:%.*]] = extractvalue { i64, i64 } [[LOHI]], 0 +// CHECK-ARM64: [[HI:%.*]] = extractvalue { i64, i64 } [[LOHI]], 1 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8* -// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.aarch64.stlxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]]) +// CHECK-ARM64: [[RES:%.*]] = call i32 @llvm.aarch64.stlxp(i64 [[LO]], i64 [[HI]], i8* [[ADDR8]]) } #endif Index: test/CodeGen/builtins-arm.c =================================================================== --- test/CodeGen/builtins-arm.c +++ test/CodeGen/builtins-arm.c @@ -1,5 +1,4 @@ -// REQUIRES: arm-registered-target -// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-eabi -target-cpu cortex-a8 -O3 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-eabi -target-cpu cortex-a8 -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s void *f0() { @@ -87,14 +86,14 @@ unsigned mrc() { // CHECK: define i32 @mrc() - // CHECK: [[R:%.*]] = {{.*}} call i32 @llvm.arm.mrc(i32 15, i32 0, i32 13, i32 0, i32 3) + // CHECK: [[R:%.*]] = call i32 @llvm.arm.mrc(i32 15, i32 0, i32 13, i32 0, i32 3) // CHECK-NEXT: ret i32 [[R]] return __builtin_arm_mrc(15, 0, 13, 0, 3); } 
unsigned mrc2() { // CHECK: define i32 @mrc2() - // CHECK: [[R:%.*]] = {{.*}} call i32 @llvm.arm.mrc2(i32 15, i32 0, i32 13, i32 0, i32 3) + // CHECK: [[R:%.*]] = call i32 @llvm.arm.mrc2(i32 15, i32 0, i32 13, i32 0, i32 3) // CHECK-NEXT: ret i32 [[R]] return __builtin_arm_mrc2(15, 0, 13, 0, 3); } @@ -124,40 +123,40 @@ } unsigned rsr() { - // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i32 @llvm.read_register.i32(metadata !7) + // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i32 @llvm.read_register.i32(metadata ![[M0:.*]]) // CHECK-NEXT: ret i32 [[V0]] return __builtin_arm_rsr("cp1:2:c3:c4:5"); } unsigned long long rsr64() { - // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata !8) + // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M1:.*]]) // CHECK-NEXT: ret i64 [[V0]] return __builtin_arm_rsr64("cp1:2:c3"); } void *rsrp() { - // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i32 @llvm.read_register.i32(metadata !9) + // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i32 @llvm.read_register.i32(metadata ![[M2:.*]]) // CHECK-NEXT: [[V1:[%A-Za-z0-9.]+]] = inttoptr i32 [[V0]] to i8* // CHECK-NEXT: ret i8* [[V1]] return __builtin_arm_rsrp("sysreg"); } void wsr(unsigned v) { - // CHECK: call void @llvm.write_register.i32(metadata !7, i32 %v) + // CHECK: call void @llvm.write_register.i32(metadata ![[M0]], i32 %v) __builtin_arm_wsr("cp1:2:c3:c4:5", v); } void wsr64(unsigned long long v) { - // CHECK: call void @llvm.write_register.i64(metadata !8, i64 %v) + // CHECK: call void @llvm.write_register.i64(metadata ![[M1]], i64 %v) __builtin_arm_wsr64("cp1:2:c3", v); } void wsrp(void *v) { // CHECK: [[V0:[%A-Za-z0-9.]+]] = ptrtoint i8* %v to i32 - // CHECK-NEXT: call void @llvm.write_register.i32(metadata !9, i32 [[V0]]) + // CHECK-NEXT: call void @llvm.write_register.i32(metadata ![[M2]], i32 [[V0]]) __builtin_arm_wsrp("sysreg", v); } -// CHECK: !7 = !{!"cp1:2:c3:c4:5"} -// CHECK: !8 = !{!"cp1:2:c3"} -// CHECK: !9 = !{!"sysreg"} +// CHECK: ![[M0]] = !{!"cp1:2:c3:c4:5"} +// CHECK: ![[M1]] = !{!"cp1:2:c3"} +// CHECK: ![[M2]] = !{!"sysreg"} Index: test/CodeGen/builtins-arm64.c =================================================================== --- test/CodeGen/builtins-arm64.c +++ test/CodeGen/builtins-arm64.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple arm64-apple-ios -O3 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-apple-ios -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s void f0(void *a, void *b) { __clear_cache(a,b); @@ -50,7 +50,7 @@ } unsigned rsr() { - // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]]) + // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]]) // CHECK-NEXT: trunc i64 [[V0]] to i32 return __builtin_arm_rsr("1:2:3:4:5"); } @@ -61,7 +61,7 @@ } void *rsrp() { - // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]]) + // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]]) // CHECK-NEXT: inttoptr i64 [[V0]] to i8* return __builtin_arm_rsrp("1:2:3:4:5"); } Index: test/CodeGen/fp128_complex.c =================================================================== --- test/CodeGen/fp128_complex.c +++ test/CodeGen/fp128_complex.c @@ -1,9 +1,9 @@ -// RUN: %clang -target aarch64-linux-gnuabi %s -O3 -S -emit-llvm -o - | FileCheck %s +// RUN: %clang -target aarch64-linux-gnuabi %s -S -emit-llvm -o - | FileCheck %s _Complex long double a, b, c, d; void 
test_fp128_compound_assign(void) { - // CHECK: tail call { fp128, fp128 } @__multc3 + // CHECK: call { fp128, fp128 } @__multc3 a *= b; - // CHECK: tail call { fp128, fp128 } @__divtc3 + // CHECK: call { fp128, fp128 } @__divtc3 c /= d; } Index: test/CodeGen/neon-immediate-ubsan.c =================================================================== --- test/CodeGen/neon-immediate-ubsan.c +++ test/CodeGen/neon-immediate-ubsan.c @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -triple armv7s-linux-gnu -emit-llvm -O1 -o - %s \ +// RUN: %clang_cc1 -triple armv7s-linux-gnu -emit-llvm -o - %s \ // RUN: -target-feature +neon -target-cpu cortex-a8 \ // RUN: -fsanitize=signed-integer-overflow \ // RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ARMV7 -// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm -O1 -o - %s \ +// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm -o - %s \ // RUN: -target-feature +neon -target-cpu cortex-a53 \ // RUN: -fsanitize=signed-integer-overflow \ // RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=AARCH64
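A closing note on the testing idiom this patch adopts (an illustrative sketch, not part of the diff): every RUN line above drops -O3 and instead pipes clang's unoptimized IR through opt -S -mem2reg, so the CHECK lines match front-end output plus alloca promotion rather than full optimizer output. The minimal test below shows the shape of the idiom; the function name is hypothetical, and exact value names and attribute groups (e.g. #0) vary by clang revision:

// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

#include <arm_neon.h>

// vadd_u8 is an always_inline wrapper around a plain vector add, so after
// mem2reg the body collapses to a single `add` followed by `ret`, and the
// checks can bind the result with a FileCheck pattern variable.
// CHECK-LABEL: @test_vadd_u8(
// CHECK: [[SUM:%.*]] = add <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[SUM]]
uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
  return vadd_u8(a, b);
}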