Index: lib/Basic/Targets/ARM.cpp
===================================================================
--- lib/Basic/Targets/ARM.cpp
+++ lib/Basic/Targets/ARM.cpp
@@ -440,8 +440,17 @@
       HW_FP |= HW_FP_HP;
     } else if (Feature == "+fullfp16") {
       HasLegalHalfType = true;
+      HW_FP |= HW_FP_SP | HW_FP_DP | HW_FP_HP;
+      FPU |= VFP4FPU;
     } else if (Feature == "+dotprod") {
+      FPU |= NeonFPU;
+      HW_FP |= HW_FP_SP | HW_FP_DP;
       DotProd = true;
+    } else if (Feature == "+fp16fml") {
+      HW_FP |= HW_FP_HP;
+      HasLegalHalfType = true;
+      FPU |= VFP4FPU;
+      HW_FP |= HW_FP_SP | HW_FP_DP | HW_FP_HP;
     }
   }
   HW_FP &= ~HW_FP_remove;
Index: test/CodeGen/arm_neon_intrinsics.c
===================================================================
--- test/CodeGen/arm_neon_intrinsics.c
+++ test/CodeGen/arm_neon_intrinsics.c
@@ -4,6 +4,9 @@
 // RUN:  -disable-O0-optnone -emit-llvm -o - %s \
 // RUN:  | opt -S -mem2reg | FileCheck %s
 
+// RUN: %clang -O1 -target armv8a-linux-eabi -march=armv8a+fp16fml \
+// RUN:  -S -emit-llvm -o - %s | FileCheck %s.v8
+
 #include <arm_neon.h>
 
 // CHECK-LABEL: @test_vaba_s8(
Index: test/CodeGen/arm_neon_intrinsics.c.v8
===================================================================
--- /dev/null
+++ test/CodeGen/arm_neon_intrinsics.c.v8
@@ -0,0 +1,12234 @@
+// CHECK-LABEL: test_vaba_s8
+// CHECK: entry:
+// CHECK-NEXT: %vabd_v.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
+// CHECK-NEXT: %add.i = add <8 x i8> %vabd_v.i.i, %a
+// CHECK-NEXT: ret <8 x i8> %add.i
+// CHECK-LABEL: test_vaba_s16
+// CHECK: entry:
+// CHECK-NEXT: %vabd_v2.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
+// CHECK-NEXT: %add.i = add <4 x i16> %vabd_v2.i.i, %a
+// CHECK-NEXT: ret <4 x i16> %add.i
+// CHECK-LABEL: test_vaba_s32
+// CHECK: entry:
+// CHECK-NEXT: %vabd_v2.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
+// CHECK-NEXT: %add.i = add <2 x i32> %vabd_v2.i.i, %a
+// CHECK-NEXT: ret <2 x i32> %add.i
+// CHECK-LABEL: test_vaba_u8
+// CHECK: entry:
+// CHECK-NEXT: %vabd_v.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
+// CHECK-NEXT: %add.i = add <8 x i8> %vabd_v.i.i, %a
+// CHECK-NEXT: ret <8 x i8> %add.i
+// CHECK-LABEL: test_vaba_u16
+// CHECK: entry:
+// CHECK-NEXT: %vabd_v2.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
+// CHECK-NEXT: %add.i = add <4 x i16> %vabd_v2.i.i, %a
+// CHECK-NEXT: ret <4 x i16> %add.i
+// CHECK-LABEL: test_vaba_u32
+// CHECK: entry:
+// CHECK-NEXT: %vabd_v2.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
+// CHECK-NEXT: %add.i = add <2 x i32> %vabd_v2.i.i, %a
+// CHECK-NEXT: ret <2 x i32> %add.i
+// CHECK-LABEL: test_vabaq_s8
+// CHECK: entry:
+// CHECK-NEXT: %vabdq_v.i.i = tail call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c)
+// CHECK-NEXT: %add.i = add <16 x i8> %vabdq_v.i.i, %a
+// CHECK-NEXT: ret <16 x i8> %add.i
+// CHECK-LABEL: test_vabaq_s16
+// CHECK: entry:
+// CHECK-NEXT: %vabdq_v2.i.i = tail call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %b, <8 x i16> %c)
+// CHECK-NEXT: %add.i = add <8 x i16> %vabdq_v2.i.i, %a
+// CHECK-NEXT: ret <8 x i16> %add.i
+// CHECK-LABEL: test_vabaq_s32
+// CHECK: entry:
+// CHECK-NEXT: %vabdq_v2.i.i = tail call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %b, <4 x i32> %c)
+// CHECK-NEXT: %add.i = add <4 x i32> %vabdq_v2.i.i, %a
+// CHECK-NEXT: ret <4 x i32> %add.i
+// CHECK-LABEL: test_vabaq_u8
+// CHECK: entry:
+// CHECK-NEXT: %vabdq_v.i.i = tail call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c)
<16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c) +// CHECK-NEXT: %add.i = add <16 x i8> %vabdq_v.i.i, %a +// CHECK-NEXT: ret <16 x i8> %add.i +// CHECK-LABEL: test_vabaq_u16 +// CHECK: entry: +// CHECK-NEXT: %vabdq_v2.i.i = tail call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %b, <8 x i16> %c) +// CHECK-NEXT: %add.i = add <8 x i16> %vabdq_v2.i.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vabaq_u32 +// CHECK: entry: +// CHECK-NEXT: %vabdq_v2.i.i = tail call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %b, <4 x i32> %c) +// CHECK-NEXT: %add.i = add <4 x i32> %vabdq_v2.i.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vabal_s8 +// CHECK: entry: +// CHECK-NEXT: %vabd_v.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: %vmovl.i.i.i = zext <8 x i8> %vabd_v.i.i.i to <8 x i16> +// CHECK-NEXT: %add.i = add <8 x i16> %vmovl.i.i.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vabal_s16 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) +// CHECK-NEXT: %vmovl.i.i.i = zext <4 x i16> %vabd_v2.i.i.i to <4 x i32> +// CHECK-NEXT: %add.i = add <4 x i32> %vmovl.i.i.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vabal_s32 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) +// CHECK-NEXT: %vmovl.i.i.i = zext <2 x i32> %vabd_v2.i.i.i to <2 x i64> +// CHECK-NEXT: %add.i = add <2 x i64> %vmovl.i.i.i, %a +// CHECK-NEXT: ret <2 x i64> %add.i +// CHECK-LABEL: test_vabal_u8 +// CHECK: entry: +// CHECK-NEXT: %vabd_v.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: %vmovl.i.i.i = zext <8 x i8> %vabd_v.i.i.i to <8 x i16> +// CHECK-NEXT: %add.i = add <8 x i16> %vmovl.i.i.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vabal_u16 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) +// CHECK-NEXT: %vmovl.i.i.i = zext <4 x i16> %vabd_v2.i.i.i to <4 x i32> +// CHECK-NEXT: %add.i = add <4 x i32> %vmovl.i.i.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vabal_u32 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) +// CHECK-NEXT: %vmovl.i.i.i = zext <2 x i32> %vabd_v2.i.i.i to <2 x i64> +// CHECK-NEXT: %add.i = add <2 x i64> %vmovl.i.i.i, %a +// CHECK-NEXT: ret <2 x i64> %add.i +// CHECK-LABEL: test_vabd_s8 +// CHECK: entry: +// CHECK-NEXT: %vabd_v.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vabd_v.i +// CHECK-LABEL: test_vabd_s16 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vabd_v2.i +// CHECK-LABEL: test_vabd_s32 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vabd_v2.i +// CHECK-LABEL: test_vabd_u8 +// CHECK: entry: +// CHECK-NEXT: %vabd_v.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vabd_v.i +// CHECK-LABEL: test_vabd_u16 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vabd_v2.i +// 
CHECK-LABEL: test_vabd_u32 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vabd_v2.i +// CHECK-LABEL: test_vabd_f32 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i = tail call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK-NEXT: ret <2 x float> %vabd_v2.i +// CHECK-LABEL: test_vabdq_s8 +// CHECK: entry: +// CHECK-NEXT: %vabdq_v.i = tail call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vabdq_v.i +// CHECK-LABEL: test_vabdq_s16 +// CHECK: entry: +// CHECK-NEXT: %vabdq_v2.i = tail call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vabdq_v2.i +// CHECK-LABEL: test_vabdq_s32 +// CHECK: entry: +// CHECK-NEXT: %vabdq_v2.i = tail call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vabdq_v2.i +// CHECK-LABEL: test_vabdq_u8 +// CHECK: entry: +// CHECK-NEXT: %vabdq_v.i = tail call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vabdq_v.i +// CHECK-LABEL: test_vabdq_u16 +// CHECK: entry: +// CHECK-NEXT: %vabdq_v2.i = tail call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vabdq_v2.i +// CHECK-LABEL: test_vabdq_u32 +// CHECK: entry: +// CHECK-NEXT: %vabdq_v2.i = tail call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vabdq_v2.i +// CHECK-LABEL: test_vabdq_f32 +// CHECK: entry: +// CHECK-NEXT: %vabdq_v2.i = tail call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %a, <4 x float> %b) +// CHECK-NEXT: ret <4 x float> %vabdq_v2.i +// CHECK-LABEL: test_vabdl_s8 +// CHECK: entry: +// CHECK-NEXT: %vabd_v.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: %vmovl.i.i = zext <8 x i8> %vabd_v.i.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %vmovl.i.i +// CHECK-LABEL: test_vabdl_s16 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: %vmovl.i.i = zext <4 x i16> %vabd_v2.i.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %vmovl.i.i +// CHECK-LABEL: test_vabdl_s32 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: %vmovl.i.i = zext <2 x i32> %vabd_v2.i.i to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %vmovl.i.i +// CHECK-LABEL: test_vabdl_u8 +// CHECK: entry: +// CHECK-NEXT: %vabd_v.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: %vmovl.i.i = zext <8 x i8> %vabd_v.i.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %vmovl.i.i +// CHECK-LABEL: test_vabdl_u16 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: %vmovl.i.i = zext <4 x i16> %vabd_v2.i.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %vmovl.i.i +// CHECK-LABEL: test_vabdl_u32 +// CHECK: entry: +// CHECK-NEXT: %vabd_v2.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: %vmovl.i.i = zext <2 x i32> %vabd_v2.i.i to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %vmovl.i.i +// CHECK-LABEL: test_vabs_s8 +// CHECK: entry: +// CHECK-NEXT: %vabs.i = tail call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) +// CHECK-NEXT: ret <8 x i8> %vabs.i +// CHECK-LABEL: 
test_vabs_s16 +// CHECK: entry: +// CHECK-NEXT: %vabs1.i = tail call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a) +// CHECK-NEXT: ret <4 x i16> %vabs1.i +// CHECK-LABEL: test_vabs_s32 +// CHECK: entry: +// CHECK-NEXT: %vabs1.i = tail call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a) +// CHECK-NEXT: ret <2 x i32> %vabs1.i +// CHECK-LABEL: test_vabs_f32 +// CHECK: entry: +// CHECK-NEXT: %vabs1.i = tail call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) +// CHECK-NEXT: ret <2 x float> %vabs1.i +// CHECK-LABEL: test_vabsq_s8 +// CHECK: entry: +// CHECK-NEXT: %vabs.i = tail call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) +// CHECK-NEXT: ret <16 x i8> %vabs.i +// CHECK-LABEL: test_vabsq_s16 +// CHECK: entry: +// CHECK-NEXT: %vabs1.i = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a) +// CHECK-NEXT: ret <8 x i16> %vabs1.i +// CHECK-LABEL: test_vabsq_s32 +// CHECK: entry: +// CHECK-NEXT: %vabs1.i = tail call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a) +// CHECK-NEXT: ret <4 x i32> %vabs1.i +// CHECK-LABEL: test_vabsq_f32 +// CHECK: entry: +// CHECK-NEXT: %vabs1.i = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) +// CHECK-NEXT: ret <4 x float> %vabs1.i +// CHECK-LABEL: test_vadd_s8 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <8 x i8> %b, %a +// CHECK-NEXT: ret <8 x i8> %add.i +// CHECK-LABEL: test_vadd_s16 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <4 x i16> %b, %a +// CHECK-NEXT: ret <4 x i16> %add.i +// CHECK-LABEL: test_vadd_s32 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <2 x i32> %b, %a +// CHECK-NEXT: ret <2 x i32> %add.i +// CHECK-LABEL: test_vadd_s64 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <1 x i64> %b, %a +// CHECK-NEXT: ret <1 x i64> %add.i +// CHECK-LABEL: test_vadd_f32 +// CHECK: entry: +// CHECK-NEXT: %add.i = fadd <2 x float> %a, %b +// CHECK-NEXT: ret <2 x float> %add.i +// CHECK-LABEL: test_vadd_u8 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <8 x i8> %b, %a +// CHECK-NEXT: ret <8 x i8> %add.i +// CHECK-LABEL: test_vadd_u16 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <4 x i16> %b, %a +// CHECK-NEXT: ret <4 x i16> %add.i +// CHECK-LABEL: test_vadd_u32 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <2 x i32> %b, %a +// CHECK-NEXT: ret <2 x i32> %add.i +// CHECK-LABEL: test_vadd_u64 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <1 x i64> %b, %a +// CHECK-NEXT: ret <1 x i64> %add.i +// CHECK-LABEL: test_vaddq_s8 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <16 x i8> %b, %a +// CHECK-NEXT: ret <16 x i8> %add.i +// CHECK-LABEL: test_vaddq_s16 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <8 x i16> %b, %a +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vaddq_s32 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <4 x i32> %b, %a +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vaddq_s64 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <2 x i64> %b, %a +// CHECK-NEXT: ret <2 x i64> %add.i +// CHECK-LABEL: test_vaddq_f32 +// CHECK: entry: +// CHECK-NEXT: %add.i = fadd <4 x float> %a, %b +// CHECK-NEXT: ret <4 x float> %add.i +// CHECK-LABEL: test_vaddq_u8 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <16 x i8> %b, %a +// CHECK-NEXT: ret <16 x i8> %add.i +// CHECK-LABEL: test_vaddq_u16 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <8 x i16> %b, %a +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vaddq_u32 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <4 x i32> %b, %a +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vaddq_u64 +// CHECK: entry: +// CHECK-NEXT: %add.i = add <2 x i64> %b, %a 
+// CHECK-NEXT: ret <2 x i64> %add.i +// CHECK-LABEL: test_vaddhn_s16 +// CHECK: entry: +// CHECK-NEXT: %vaddhn.i = add <8 x i16> %b, %a +// CHECK-NEXT: %vaddhn1.i = lshr <8 x i16> %vaddhn.i, +// CHECK-NEXT: %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %vaddhn2.i +// CHECK-LABEL: test_vaddhn_s32 +// CHECK: entry: +// CHECK-NEXT: %vaddhn.i = add <4 x i32> %b, %a +// CHECK-NEXT: %vaddhn1.i = lshr <4 x i32> %vaddhn.i, +// CHECK-NEXT: %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vaddhn2.i +// CHECK-LABEL: test_vaddhn_s64 +// CHECK: entry: +// CHECK-NEXT: %vaddhn.i = add <2 x i64> %b, %a +// CHECK-NEXT: %vaddhn1.i = lshr <2 x i64> %vaddhn.i, +// CHECK-NEXT: %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vaddhn2.i +// CHECK-LABEL: test_vaddhn_u16 +// CHECK: entry: +// CHECK-NEXT: %vaddhn.i = add <8 x i16> %b, %a +// CHECK-NEXT: %vaddhn1.i = lshr <8 x i16> %vaddhn.i, +// CHECK-NEXT: %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %vaddhn2.i +// CHECK-LABEL: test_vaddhn_u32 +// CHECK: entry: +// CHECK-NEXT: %vaddhn.i = add <4 x i32> %b, %a +// CHECK-NEXT: %vaddhn1.i = lshr <4 x i32> %vaddhn.i, +// CHECK-NEXT: %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vaddhn2.i +// CHECK-LABEL: test_vaddhn_u64 +// CHECK: entry: +// CHECK-NEXT: %vaddhn.i = add <2 x i64> %b, %a +// CHECK-NEXT: %vaddhn1.i = lshr <2 x i64> %vaddhn.i, +// CHECK-NEXT: %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vaddhn2.i +// CHECK-LABEL: test_vaddl_s8 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = sext <8 x i8> %a to <8 x i16> +// CHECK-NEXT: %vmovl.i3.i = sext <8 x i8> %b to <8 x i16> +// CHECK-NEXT: %add.i = add nsw <8 x i16> %vmovl.i3.i, %vmovl.i.i +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vaddl_s16 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = sext <4 x i16> %a to <4 x i32> +// CHECK-NEXT: %vmovl.i3.i = sext <4 x i16> %b to <4 x i32> +// CHECK-NEXT: %add.i = add nsw <4 x i32> %vmovl.i3.i, %vmovl.i.i +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vaddl_s32 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = sext <2 x i32> %a to <2 x i64> +// CHECK-NEXT: %vmovl.i3.i = sext <2 x i32> %b to <2 x i64> +// CHECK-NEXT: %add.i = add nsw <2 x i64> %vmovl.i3.i, %vmovl.i.i +// CHECK-NEXT: ret <2 x i64> %add.i +// CHECK-LABEL: test_vaddl_u8 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <8 x i8> %a to <8 x i16> +// CHECK-NEXT: %vmovl.i3.i = zext <8 x i8> %b to <8 x i16> +// CHECK-NEXT: %add.i = add nuw nsw <8 x i16> %vmovl.i3.i, %vmovl.i.i +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vaddl_u16 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <4 x i16> %a to <4 x i32> +// CHECK-NEXT: %vmovl.i3.i = zext <4 x i16> %b to <4 x i32> +// CHECK-NEXT: %add.i = add nuw nsw <4 x i32> %vmovl.i3.i, %vmovl.i.i +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vaddl_u32 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <2 x i32> %a to <2 x i64> +// CHECK-NEXT: %vmovl.i3.i = zext <2 x i32> %b to <2 x i64> +// CHECK-NEXT: %add.i = add nuw nsw <2 x i64> %vmovl.i3.i, %vmovl.i.i +// CHECK-NEXT: ret <2 x i64> %add.i +// CHECK-LABEL: test_vaddw_s8 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = sext <8 x i8> %b to <8 x i16> +// CHECK-NEXT: %add.i = add <8 x i16> %vmovl.i.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vaddw_s16 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i 
= sext <4 x i16> %b to <4 x i32> +// CHECK-NEXT: %add.i = add <4 x i32> %vmovl.i.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vaddw_s32 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = sext <2 x i32> %b to <2 x i64> +// CHECK-NEXT: %add.i = add <2 x i64> %vmovl.i.i, %a +// CHECK-NEXT: ret <2 x i64> %add.i +// CHECK-LABEL: test_vaddw_u8 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <8 x i8> %b to <8 x i16> +// CHECK-NEXT: %add.i = add <8 x i16> %vmovl.i.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vaddw_u16 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <4 x i16> %b to <4 x i32> +// CHECK-NEXT: %add.i = add <4 x i32> %vmovl.i.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vaddw_u32 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <2 x i32> %b to <2 x i64> +// CHECK-NEXT: %add.i = add <2 x i64> %vmovl.i.i, %a +// CHECK-NEXT: ret <2 x i64> %add.i +// CHECK-LABEL: test_vand_s8 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <8 x i8> %b, %a +// CHECK-NEXT: ret <8 x i8> %and.i +// CHECK-LABEL: test_vand_s16 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <4 x i16> %b, %a +// CHECK-NEXT: ret <4 x i16> %and.i +// CHECK-LABEL: test_vand_s32 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <2 x i32> %b, %a +// CHECK-NEXT: ret <2 x i32> %and.i +// CHECK-LABEL: test_vand_s64 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <1 x i64> %b, %a +// CHECK-NEXT: ret <1 x i64> %and.i +// CHECK-LABEL: test_vand_u8 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <8 x i8> %b, %a +// CHECK-NEXT: ret <8 x i8> %and.i +// CHECK-LABEL: test_vand_u16 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <4 x i16> %b, %a +// CHECK-NEXT: ret <4 x i16> %and.i +// CHECK-LABEL: test_vand_u32 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <2 x i32> %b, %a +// CHECK-NEXT: ret <2 x i32> %and.i +// CHECK-LABEL: test_vand_u64 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <1 x i64> %b, %a +// CHECK-NEXT: ret <1 x i64> %and.i +// CHECK-LABEL: test_vandq_s8 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <16 x i8> %b, %a +// CHECK-NEXT: ret <16 x i8> %and.i +// CHECK-LABEL: test_vandq_s16 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <8 x i16> %b, %a +// CHECK-NEXT: ret <8 x i16> %and.i +// CHECK-LABEL: test_vandq_s32 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <4 x i32> %b, %a +// CHECK-NEXT: ret <4 x i32> %and.i +// CHECK-LABEL: test_vandq_s64 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <2 x i64> %b, %a +// CHECK-NEXT: ret <2 x i64> %and.i +// CHECK-LABEL: test_vandq_u8 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <16 x i8> %b, %a +// CHECK-NEXT: ret <16 x i8> %and.i +// CHECK-LABEL: test_vandq_u16 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <8 x i16> %b, %a +// CHECK-NEXT: ret <8 x i16> %and.i +// CHECK-LABEL: test_vandq_u32 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <4 x i32> %b, %a +// CHECK-NEXT: ret <4 x i32> %and.i +// CHECK-LABEL: test_vandq_u64 +// CHECK: entry: +// CHECK-NEXT: %and.i = and <2 x i64> %b, %a +// CHECK-NEXT: ret <2 x i64> %and.i +// CHECK-LABEL: test_vbic_s8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i8> %b, +// CHECK-NEXT: %and.i = and <8 x i8> %neg.i, %a +// CHECK-NEXT: ret <8 x i8> %and.i +// CHECK-LABEL: test_vbic_s16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i16> %b, +// CHECK-NEXT: %and.i = and <4 x i16> %neg.i, %a +// CHECK-NEXT: ret <4 x i16> %and.i +// CHECK-LABEL: test_vbic_s32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <2 x i32> %b, +// CHECK-NEXT: %and.i = and <2 x i32> %neg.i, %a +// CHECK-NEXT: ret 
<2 x i32> %and.i +// CHECK-LABEL: test_vbic_s64 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <1 x i64> %b, +// CHECK-NEXT: %and.i = and <1 x i64> %neg.i, %a +// CHECK-NEXT: ret <1 x i64> %and.i +// CHECK-LABEL: test_vbic_u8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i8> %b, +// CHECK-NEXT: %and.i = and <8 x i8> %neg.i, %a +// CHECK-NEXT: ret <8 x i8> %and.i +// CHECK-LABEL: test_vbic_u16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i16> %b, +// CHECK-NEXT: %and.i = and <4 x i16> %neg.i, %a +// CHECK-NEXT: ret <4 x i16> %and.i +// CHECK-LABEL: test_vbic_u32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <2 x i32> %b, +// CHECK-NEXT: %and.i = and <2 x i32> %neg.i, %a +// CHECK-NEXT: ret <2 x i32> %and.i +// CHECK-LABEL: test_vbic_u64 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <1 x i64> %b, +// CHECK-NEXT: %and.i = and <1 x i64> %neg.i, %a +// CHECK-NEXT: ret <1 x i64> %and.i +// CHECK-LABEL: test_vbicq_s8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <16 x i8> %b, +// CHECK-NEXT: %and.i = and <16 x i8> %neg.i, %a +// CHECK-NEXT: ret <16 x i8> %and.i +// CHECK-LABEL: test_vbicq_s16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i16> %b, +// CHECK-NEXT: %and.i = and <8 x i16> %neg.i, %a +// CHECK-NEXT: ret <8 x i16> %and.i +// CHECK-LABEL: test_vbicq_s32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i32> %b, +// CHECK-NEXT: %and.i = and <4 x i32> %neg.i, %a +// CHECK-NEXT: ret <4 x i32> %and.i +// CHECK-LABEL: test_vbicq_s64 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <2 x i64> %b, +// CHECK-NEXT: %and.i = and <2 x i64> %neg.i, %a +// CHECK-NEXT: ret <2 x i64> %and.i +// CHECK-LABEL: test_vbicq_u8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <16 x i8> %b, +// CHECK-NEXT: %and.i = and <16 x i8> %neg.i, %a +// CHECK-NEXT: ret <16 x i8> %and.i +// CHECK-LABEL: test_vbicq_u16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i16> %b, +// CHECK-NEXT: %and.i = and <8 x i16> %neg.i, %a +// CHECK-NEXT: ret <8 x i16> %and.i +// CHECK-LABEL: test_vbicq_u32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i32> %b, +// CHECK-NEXT: %and.i = and <4 x i32> %neg.i, %a +// CHECK-NEXT: ret <4 x i32> %and.i +// CHECK-LABEL: test_vbicq_u64 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <2 x i64> %b, +// CHECK-NEXT: %and.i = and <2 x i64> %neg.i, %a +// CHECK-NEXT: ret <2 x i64> %and.i +// CHECK-LABEL: test_vbsl_s8 +// CHECK: entry: +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vbsl_v.i +// CHECK-LABEL: test_vbsl_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: %1 = bitcast <4 x i16> %b to <8 x i8> +// CHECK-NEXT: %2 = bitcast <4 x i16> %c to <8 x i8> +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) +// CHECK-NEXT: %3 = bitcast <8 x i8> %vbsl_v.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %3 +// CHECK-LABEL: test_vbsl_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <8 x i8> +// CHECK-NEXT: %1 = bitcast <2 x i32> %b to <8 x i8> +// CHECK-NEXT: %2 = bitcast <2 x i32> %c to <8 x i8> +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) +// CHECK-NEXT: %3 = bitcast <8 x i8> %vbsl_v.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %3 +// CHECK-LABEL: test_vbsl_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <8 x i8> +// CHECK-NEXT: %1 = bitcast <1 x i64> %b to <8 x i8> +// CHECK-NEXT: %2 = 
bitcast <1 x i64> %c to <8 x i8> +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) +// CHECK-NEXT: %3 = bitcast <8 x i8> %vbsl_v.i to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %3 +// CHECK-LABEL: test_vbsl_u8 +// CHECK: entry: +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vbsl_v.i +// CHECK-LABEL: test_vbsl_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: %1 = bitcast <4 x i16> %b to <8 x i8> +// CHECK-NEXT: %2 = bitcast <4 x i16> %c to <8 x i8> +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) +// CHECK-NEXT: %3 = bitcast <8 x i8> %vbsl_v.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %3 +// CHECK-LABEL: test_vbsl_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <8 x i8> +// CHECK-NEXT: %1 = bitcast <2 x i32> %b to <8 x i8> +// CHECK-NEXT: %2 = bitcast <2 x i32> %c to <8 x i8> +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) +// CHECK-NEXT: %3 = bitcast <8 x i8> %vbsl_v.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %3 +// CHECK-LABEL: test_vbsl_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <8 x i8> +// CHECK-NEXT: %1 = bitcast <1 x i64> %b to <8 x i8> +// CHECK-NEXT: %2 = bitcast <1 x i64> %c to <8 x i8> +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) +// CHECK-NEXT: %3 = bitcast <8 x i8> %vbsl_v.i to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %3 +// CHECK-LABEL: test_vbsl_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <8 x i8> +// CHECK-NEXT: %1 = bitcast <2 x float> %b to <8 x i8> +// CHECK-NEXT: %2 = bitcast <2 x float> %c to <8 x i8> +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) +// CHECK-NEXT: %3 = bitcast <8 x i8> %vbsl_v.i to <2 x float> +// CHECK-NEXT: ret <2 x float> %3 +// CHECK-LABEL: test_vbsl_p8 +// CHECK: entry: +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vbsl_v.i +// CHECK-LABEL: test_vbsl_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: %1 = bitcast <4 x i16> %b to <8 x i8> +// CHECK-NEXT: %2 = bitcast <4 x i16> %c to <8 x i8> +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) +// CHECK-NEXT: %3 = bitcast <8 x i8> %vbsl_v.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %3 +// CHECK-LABEL: test_vbslq_s8 +// CHECK: entry: +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK-NEXT: ret <16 x i8> %vbslq_v.i +// CHECK-LABEL: test_vbslq_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: %1 = bitcast <8 x i16> %b to <16 x i8> +// CHECK-NEXT: %2 = bitcast <8 x i16> %c to <16 x i8> +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) +// CHECK-NEXT: %3 = bitcast <16 x i8> %vbslq_v.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %3 +// CHECK-LABEL: test_vbslq_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <16 x i8> +// CHECK-NEXT: %1 = bitcast <4 x i32> %b to <16 x i8> +// CHECK-NEXT: %2 = bitcast <4 x 
i32> %c to <16 x i8> +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) +// CHECK-NEXT: %3 = bitcast <16 x i8> %vbslq_v.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %3 +// CHECK-LABEL: test_vbslq_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <16 x i8> +// CHECK-NEXT: %1 = bitcast <2 x i64> %b to <16 x i8> +// CHECK-NEXT: %2 = bitcast <2 x i64> %c to <16 x i8> +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) +// CHECK-NEXT: %3 = bitcast <16 x i8> %vbslq_v.i to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %3 +// CHECK-LABEL: test_vbslq_u8 +// CHECK: entry: +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK-NEXT: ret <16 x i8> %vbslq_v.i +// CHECK-LABEL: test_vbslq_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: %1 = bitcast <8 x i16> %b to <16 x i8> +// CHECK-NEXT: %2 = bitcast <8 x i16> %c to <16 x i8> +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) +// CHECK-NEXT: %3 = bitcast <16 x i8> %vbslq_v.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %3 +// CHECK-LABEL: test_vbslq_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <16 x i8> +// CHECK-NEXT: %1 = bitcast <4 x i32> %b to <16 x i8> +// CHECK-NEXT: %2 = bitcast <4 x i32> %c to <16 x i8> +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) +// CHECK-NEXT: %3 = bitcast <16 x i8> %vbslq_v.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %3 +// CHECK-LABEL: test_vbslq_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <16 x i8> +// CHECK-NEXT: %1 = bitcast <2 x i64> %b to <16 x i8> +// CHECK-NEXT: %2 = bitcast <2 x i64> %c to <16 x i8> +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) +// CHECK-NEXT: %3 = bitcast <16 x i8> %vbslq_v.i to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %3 +// CHECK-LABEL: test_vbslq_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <16 x i8> +// CHECK-NEXT: %1 = bitcast <4 x float> %b to <16 x i8> +// CHECK-NEXT: %2 = bitcast <4 x float> %c to <16 x i8> +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) +// CHECK-NEXT: %3 = bitcast <16 x i8> %vbslq_v.i to <4 x float> +// CHECK-NEXT: ret <4 x float> %3 +// CHECK-LABEL: test_vbslq_p8 +// CHECK: entry: +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK-NEXT: ret <16 x i8> %vbslq_v.i +// CHECK-LABEL: test_vbslq_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: %1 = bitcast <8 x i16> %b to <16 x i8> +// CHECK-NEXT: %2 = bitcast <8 x i16> %c to <16 x i8> +// CHECK-NEXT: %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) +// CHECK-NEXT: %3 = bitcast <16 x i8> %vbslq_v.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %3 +// CHECK-LABEL: test_vcage_f32 +// CHECK: entry: +// CHECK-NEXT: %vcage_v2.i = tail call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK-NEXT: ret <2 x i32> %vcage_v2.i +// CHECK-LABEL: test_vcageq_f32 +// CHECK: entry: +// CHECK-NEXT: %vcageq_v2.i = tail call <4 x i32> 
@llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %a, <4 x float> %b) +// CHECK-NEXT: ret <4 x i32> %vcageq_v2.i +// CHECK-LABEL: test_vcagt_f32 +// CHECK: entry: +// CHECK-NEXT: %vcagt_v2.i = tail call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK-NEXT: ret <2 x i32> %vcagt_v2.i +// CHECK-LABEL: test_vcagtq_f32 +// CHECK: entry: +// CHECK-NEXT: %vcagtq_v2.i = tail call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %a, <4 x float> %b) +// CHECK-NEXT: ret <4 x i32> %vcagtq_v2.i +// CHECK-LABEL: test_vcale_f32 +// CHECK: entry: +// CHECK-NEXT: %vcale_v2.i = tail call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %b, <2 x float> %a) +// CHECK-NEXT: ret <2 x i32> %vcale_v2.i +// CHECK-LABEL: test_vcaleq_f32 +// CHECK: entry: +// CHECK-NEXT: %vcaleq_v2.i = tail call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %b, <4 x float> %a) +// CHECK-NEXT: ret <4 x i32> %vcaleq_v2.i +// CHECK-LABEL: test_vcalt_f32 +// CHECK: entry: +// CHECK-NEXT: %vcalt_v2.i = tail call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %b, <2 x float> %a) +// CHECK-NEXT: ret <2 x i32> %vcalt_v2.i +// CHECK-LABEL: test_vcaltq_f32 +// CHECK: entry: +// CHECK-NEXT: %vcaltq_v2.i = tail call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %b, <4 x float> %a) +// CHECK-NEXT: ret <4 x i32> %vcaltq_v2.i +// CHECK-LABEL: test_vceq_s8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vceq_s16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <4 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %sext.i +// CHECK-LABEL: test_vceq_s32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <2 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vceq_f32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = fcmp oeq <2 x float> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vceq_u8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vceq_u16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <4 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %sext.i +// CHECK-LABEL: test_vceq_u32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <2 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vceq_p8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vceqq_s8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vceqq_s16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <8 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %sext.i +// CHECK-LABEL: test_vceqq_s32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <4 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vceqq_f32 
+// CHECK: entry: +// CHECK-NEXT: %cmp.i = fcmp oeq <4 x float> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vceqq_u8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vceqq_u16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <8 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %sext.i +// CHECK-LABEL: test_vceqq_u32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <4 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vceqq_p8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp eq <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vcge_s8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sge <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vcge_s16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sge <4 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %sext.i +// CHECK-LABEL: test_vcge_s32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sge <2 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vcge_f32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = fcmp oge <2 x float> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vcge_u8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp uge <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vcge_u16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp uge <4 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %sext.i +// CHECK-LABEL: test_vcge_u32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp uge <2 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vcgeq_s8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sge <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vcgeq_s16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sge <8 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %sext.i +// CHECK-LABEL: test_vcgeq_s32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sge <4 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcgeq_f32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = fcmp oge <4 x float> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcgeq_u8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp uge <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vcgeq_u16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp uge <8 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %sext.i +// CHECK-LABEL: 
test_vcgeq_u32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp uge <4 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcgt_s8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sgt <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vcgt_s16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sgt <4 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %sext.i +// CHECK-LABEL: test_vcgt_s32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sgt <2 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vcgt_f32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = fcmp ogt <2 x float> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vcgt_u8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ugt <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vcgt_u16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ugt <4 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %sext.i +// CHECK-LABEL: test_vcgt_u32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ugt <2 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vcgtq_s8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sgt <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vcgtq_s16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sgt <8 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %sext.i +// CHECK-LABEL: test_vcgtq_s32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sgt <4 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcgtq_f32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = fcmp ogt <4 x float> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcgtq_u8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ugt <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vcgtq_u16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ugt <8 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %sext.i +// CHECK-LABEL: test_vcgtq_u32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ugt <4 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcle_s8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sle <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vcle_s16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sle <4 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %sext.i +// CHECK-LABEL: test_vcle_s32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sle <2 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// 
CHECK-LABEL: test_vcle_f32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = fcmp ole <2 x float> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vcle_u8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ule <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vcle_u16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ule <4 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %sext.i +// CHECK-LABEL: test_vcle_u32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ule <2 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vcleq_s8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sle <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vcleq_s16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sle <8 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %sext.i +// CHECK-LABEL: test_vcleq_s32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp sle <4 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcleq_f32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = fcmp ole <4 x float> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcleq_u8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ule <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vcleq_u16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ule <8 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %sext.i +// CHECK-LABEL: test_vcleq_u32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ule <4 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcls_s8 +// CHECK: entry: +// CHECK-NEXT: %vcls_v.i = tail call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) +// CHECK-NEXT: ret <8 x i8> %vcls_v.i +// CHECK-LABEL: test_vcls_s16 +// CHECK: entry: +// CHECK-NEXT: %vcls_v1.i = tail call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a) +// CHECK-NEXT: ret <4 x i16> %vcls_v1.i +// CHECK-LABEL: test_vcls_s32 +// CHECK: entry: +// CHECK-NEXT: %vcls_v1.i = tail call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a) +// CHECK-NEXT: ret <2 x i32> %vcls_v1.i +// CHECK-LABEL: test_vclsq_s8 +// CHECK: entry: +// CHECK-NEXT: %vclsq_v.i = tail call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) +// CHECK-NEXT: ret <16 x i8> %vclsq_v.i +// CHECK-LABEL: test_vclsq_s16 +// CHECK: entry: +// CHECK-NEXT: %vclsq_v1.i = tail call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a) +// CHECK-NEXT: ret <8 x i16> %vclsq_v1.i +// CHECK-LABEL: test_vclsq_s32 +// CHECK: entry: +// CHECK-NEXT: %vclsq_v1.i = tail call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a) +// CHECK-NEXT: ret <4 x i32> %vclsq_v1.i +// CHECK-LABEL: test_vclt_s8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp slt <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vclt_s16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp slt <4 x i16> %a, %b +// 
CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %sext.i +// CHECK-LABEL: test_vclt_s32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp slt <2 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vclt_f32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = fcmp olt <2 x float> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vclt_u8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ult <8 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %sext.i +// CHECK-LABEL: test_vclt_u16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ult <4 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %sext.i +// CHECK-LABEL: test_vclt_u32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ult <2 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <2 x i1> %cmp.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %sext.i +// CHECK-LABEL: test_vcltq_s8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp slt <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vcltq_s16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp slt <8 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %sext.i +// CHECK-LABEL: test_vcltq_s32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp slt <4 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcltq_f32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = fcmp olt <4 x float> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vcltq_u8 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ult <16 x i8> %a, %b +// CHECK-NEXT: %sext.i = sext <16 x i1> %cmp.i to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %sext.i +// CHECK-LABEL: test_vcltq_u16 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ult <8 x i16> %a, %b +// CHECK-NEXT: %sext.i = sext <8 x i1> %cmp.i to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %sext.i +// CHECK-LABEL: test_vcltq_u32 +// CHECK: entry: +// CHECK-NEXT: %cmp.i = icmp ult <4 x i32> %a, %b +// CHECK-NEXT: %sext.i = sext <4 x i1> %cmp.i to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %sext.i +// CHECK-LABEL: test_vclz_s8 +// CHECK: entry: +// CHECK-NEXT: %vclz_v.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) +// CHECK-NEXT: ret <8 x i8> %vclz_v.i +// CHECK-LABEL: test_vclz_s16 +// CHECK: entry: +// CHECK-NEXT: %vclz_v1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) +// CHECK-NEXT: ret <4 x i16> %vclz_v1.i +// CHECK-LABEL: test_vclz_s32 +// CHECK: entry: +// CHECK-NEXT: %vclz_v1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) +// CHECK-NEXT: ret <2 x i32> %vclz_v1.i +// CHECK-LABEL: test_vclz_u8 +// CHECK: entry: +// CHECK-NEXT: %vclz_v.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) +// CHECK-NEXT: ret <8 x i8> %vclz_v.i +// CHECK-LABEL: test_vclz_u16 +// CHECK: entry: +// CHECK-NEXT: %vclz_v1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) +// CHECK-NEXT: ret <4 x i16> %vclz_v1.i +// CHECK-LABEL: test_vclz_u32 +// CHECK: entry: +// CHECK-NEXT: %vclz_v1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) +// CHECK-NEXT: ret <2 x i32> %vclz_v1.i +// 
CHECK-LABEL: test_vclzq_s8 +// CHECK: entry: +// CHECK-NEXT: %vclzq_v.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) +// CHECK-NEXT: ret <16 x i8> %vclzq_v.i +// CHECK-LABEL: test_vclzq_s16 +// CHECK: entry: +// CHECK-NEXT: %vclzq_v1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) +// CHECK-NEXT: ret <8 x i16> %vclzq_v1.i +// CHECK-LABEL: test_vclzq_s32 +// CHECK: entry: +// CHECK-NEXT: %vclzq_v1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) +// CHECK-NEXT: ret <4 x i32> %vclzq_v1.i +// CHECK-LABEL: test_vclzq_u8 +// CHECK: entry: +// CHECK-NEXT: %vclzq_v.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) +// CHECK-NEXT: ret <16 x i8> %vclzq_v.i +// CHECK-LABEL: test_vclzq_u16 +// CHECK: entry: +// CHECK-NEXT: %vclzq_v1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) +// CHECK-NEXT: ret <8 x i16> %vclzq_v1.i +// CHECK-LABEL: test_vclzq_u32 +// CHECK: entry: +// CHECK-NEXT: %vclzq_v1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) +// CHECK-NEXT: ret <4 x i32> %vclzq_v1.i +// CHECK-LABEL: test_vcnt_u8 +// CHECK: entry: +// CHECK-NEXT: %vcnt_v.i = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) +// CHECK-NEXT: ret <8 x i8> %vcnt_v.i +// CHECK-LABEL: test_vcnt_s8 +// CHECK: entry: +// CHECK-NEXT: %vcnt_v.i = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) +// CHECK-NEXT: ret <8 x i8> %vcnt_v.i +// CHECK-LABEL: test_vcnt_p8 +// CHECK: entry: +// CHECK-NEXT: %vcnt_v.i = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) +// CHECK-NEXT: ret <8 x i8> %vcnt_v.i +// CHECK-LABEL: test_vcntq_u8 +// CHECK: entry: +// CHECK-NEXT: %vcntq_v.i = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) +// CHECK-NEXT: ret <16 x i8> %vcntq_v.i +// CHECK-LABEL: test_vcntq_s8 +// CHECK: entry: +// CHECK-NEXT: %vcntq_v.i = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) +// CHECK-NEXT: ret <16 x i8> %vcntq_v.i +// CHECK-LABEL: test_vcntq_p8 +// CHECK: entry: +// CHECK-NEXT: %vcntq_v.i = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) +// CHECK-NEXT: ret <16 x i8> %vcntq_v.i +// CHECK-LABEL: test_vcombine_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vcombine_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> +// CHECK-NEXT: ret <8 x i16> %shuffle.i +// CHECK-LABEL: test_vcombine_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> +// CHECK-NEXT: ret <4 x i32> %shuffle.i +// CHECK-LABEL: test_vcombine_s64 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> +// CHECK-NEXT: ret <2 x i64> %shuffle.i +// CHECK-LABEL: test_vcombine_f16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> +// CHECK-NEXT: ret <8 x half> %shuffle.i +// CHECK-LABEL: test_vcombine_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> +// CHECK-NEXT: ret <4 x float> %shuffle.i +// CHECK-LABEL: test_vcombine_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vcombine_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> +// CHECK-NEXT: ret <8 x i16> %shuffle.i +// CHECK-LABEL: test_vcombine_u32 +// CHECK: entry: +// 
CHECK-NEXT: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> +// CHECK-NEXT: ret <4 x i32> %shuffle.i +// CHECK-LABEL: test_vcombine_u64 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> +// CHECK-NEXT: ret <2 x i64> %shuffle.i +// CHECK-LABEL: test_vcombine_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vcombine_p16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> +// CHECK-NEXT: ret <8 x i16> %shuffle.i +// CHECK-LABEL: test_vcreate_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64 %a to <8 x i8> +// CHECK-NEXT: %vclz_v.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %0, i1 false) +// CHECK-NEXT: ret <8 x i8> %vclz_v.i +// CHECK-LABEL: test_vcreate_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64 %a to <4 x i16> +// CHECK-NEXT: %vclz_v1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %0, i1 false) +// CHECK-NEXT: ret <4 x i16> %vclz_v1.i +// CHECK-LABEL: test_vcreate_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64 %a to <2 x i32> +// CHECK-NEXT: %vclz_v1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %0, i1 false) +// CHECK-NEXT: ret <2 x i32> %vclz_v1.i +// CHECK-LABEL: test_vcreate_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64 %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vcreate_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64 %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vcreate_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64 %a to <8 x i8> +// CHECK-NEXT: %vclz_v.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %0, i1 false) +// CHECK-NEXT: ret <8 x i8> %vclz_v.i +// CHECK-LABEL: test_vcreate_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64 %a to <4 x i16> +// CHECK-NEXT: %vclz_v1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %0, i1 false) +// CHECK-NEXT: ret <4 x i16> %vclz_v1.i +// CHECK-LABEL: test_vcreate_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64 %a to <2 x i32> +// CHECK-NEXT: %vclz_v1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %0, i1 false) +// CHECK-NEXT: ret <2 x i32> %vclz_v1.i +// CHECK-LABEL: test_vcreate_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: %add.i = shl <1 x i64> %0, +// CHECK-NEXT: ret <1 x i64> %add.i +// CHECK-LABEL: test_vcreate_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64 %a to <8 x i8> +// CHECK-NEXT: %vcnt_v.i = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %0) +// CHECK-NEXT: ret <8 x i8> %vcnt_v.i +// CHECK-LABEL: test_vcreate_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64 %a to <8 x i8> +// CHECK-NEXT: %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %0, <8 x i8> %0) +// CHECK-NEXT: %1 = bitcast <8 x i8> %vbsl_v.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %1 +// CHECK-LABEL: test_vcreate_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: %add.i = shl <1 x i64> %0, +// CHECK-NEXT: ret <1 x i64> %add.i +// CHECK-LABEL: test_vcvt_f16_f32 +// CHECK: entry: +// CHECK-NEXT: %vcvt_f16_f321.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a) +// CHECK-NEXT: %0 = bitcast <4 x i16> %vcvt_f16_f321.i to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vcvt_f32_s32 +// CHECK: entry: +// CHECK-NEXT: %vcvt.i = sitofp <2 x 
i32> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %vcvt.i +// CHECK-LABEL: test_vcvt_f32_u32 +// CHECK: entry: +// CHECK-NEXT: %vcvt.i = uitofp <2 x i32> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %vcvt.i +// CHECK-LABEL: test_vcvtq_f32_s32 +// CHECK: entry: +// CHECK-NEXT: %vcvt.i = sitofp <4 x i32> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %vcvt.i +// CHECK-LABEL: test_vcvtq_f32_u32 +// CHECK: entry: +// CHECK-NEXT: %vcvt.i = uitofp <4 x i32> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %vcvt.i +// CHECK-LABEL: test_vcvt_f32_f16 +// CHECK: entry: +// CHECK-NEXT: %vcvt_f32_f16.i = bitcast <4 x half> %a to <4 x i16> +// CHECK-NEXT: %vcvt_f32_f161.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vcvt_f32_f16.i) +// CHECK-NEXT: ret <4 x float> %vcvt_f32_f161.i +// CHECK-LABEL: test_vcvt_n_f32_s32 +// CHECK: entry: +// CHECK-NEXT: %vcvt_n1 = tail call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 1) +// CHECK-NEXT: ret <2 x float> %vcvt_n1 +// CHECK-LABEL: test_vcvt_n_f32_u32 +// CHECK: entry: +// CHECK-NEXT: %vcvt_n1 = tail call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 1) +// CHECK-NEXT: ret <2 x float> %vcvt_n1 +// CHECK-LABEL: test_vcvtq_n_f32_s32 +// CHECK: entry: +// CHECK-NEXT: %vcvt_n1 = tail call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 3) +// CHECK-NEXT: ret <4 x float> %vcvt_n1 +// CHECK-LABEL: test_vcvtq_n_f32_u32 +// CHECK: entry: +// CHECK-NEXT: %vcvt_n1 = tail call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 3) +// CHECK-NEXT: ret <4 x float> %vcvt_n1 +// CHECK-LABEL: test_vcvt_n_s32_f32 +// CHECK: entry: +// CHECK-NEXT: %vcvt_n1 = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %a, i32 1) +// CHECK-NEXT: ret <2 x i32> %vcvt_n1 +// CHECK-LABEL: test_vcvtq_n_s32_f32 +// CHECK: entry: +// CHECK-NEXT: %vcvt_n1 = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %a, i32 3) +// CHECK-NEXT: ret <4 x i32> %vcvt_n1 +// CHECK-LABEL: test_vcvt_n_u32_f32 +// CHECK: entry: +// CHECK-NEXT: %vcvt_n1 = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %a, i32 1) +// CHECK-NEXT: ret <2 x i32> %vcvt_n1 +// CHECK-LABEL: test_vcvtq_n_u32_f32 +// CHECK: entry: +// CHECK-NEXT: %vcvt_n1 = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %a, i32 3) +// CHECK-NEXT: ret <4 x i32> %vcvt_n1 +// CHECK-LABEL: test_vcvt_s32_f32 +// CHECK: entry: +// CHECK-NEXT: %vcvt.i = fptosi <2 x float> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vcvt.i +// CHECK-LABEL: test_vcvtq_s32_f32 +// CHECK: entry: +// CHECK-NEXT: %vcvt.i = fptosi <4 x float> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %vcvt.i +// CHECK-LABEL: test_vcvt_u32_f32 +// CHECK: entry: +// CHECK-NEXT: %vcvt.i = fptoui <2 x float> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vcvt.i +// CHECK-LABEL: test_vcvtq_u32_f32 +// CHECK: entry: +// CHECK-NEXT: %vcvt.i = fptoui <4 x float> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %vcvt.i +// CHECK-LABEL: test_vdup_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i8> %shuffle +// CHECK-LABEL: test_vdup_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: ret <4 x i16> %shuffle +// CHECK-LABEL: test_vdup_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> +// CHECK-NEXT: ret <2 x 
i32> %shuffle +// CHECK-LABEL: test_vdup_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK-NEXT: ret <8 x i8> %shuffle +// CHECK-LABEL: test_vdup_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: ret <4 x i16> %shuffle +// CHECK-LABEL: test_vdup_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: ret <2 x i32> %shuffle +// CHECK-LABEL: test_vdup_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK-NEXT: ret <8 x i8> %shuffle +// CHECK-LABEL: test_vdup_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: ret <4 x i16> %shuffle +// CHECK-LABEL: test_vdup_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: ret <2 x float> %shuffle +// CHECK-LABEL: test_vdupq_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <8 x i8> %a, <8 x i8> undef, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK-NEXT: ret <16 x i8> %shuffle +// CHECK-LABEL: test_vdupq_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %a, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: ret <8 x i16> %shuffle +// CHECK-LABEL: test_vdupq_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: ret <4 x i32> %shuffle +// CHECK-LABEL: test_vdupq_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <8 x i8> %a, <8 x i8> undef, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK-NEXT: ret <16 x i8> %shuffle +// CHECK-LABEL: test_vdupq_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %a, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: ret <8 x i16> %shuffle +// CHECK-LABEL: test_vdupq_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: ret <4 x i32> %shuffle +// CHECK-LABEL: test_vdupq_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <8 x i8> %a, <8 x i8> undef, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK-NEXT: ret <16 x i8> %shuffle +// CHECK-LABEL: test_vdupq_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %a, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: ret <8 x i16> %shuffle +// CHECK-LABEL: test_vdupq_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: ret <4 x float> %shuffle +// CHECK-LABEL: test_vdup_lane_s64 +// CHECK: entry: +// CHECK-NEXT: ret <1 x i64> %a +// CHECK-LABEL: test_vdup_lane_u64 +// CHECK: entry: +// CHECK-NEXT: ret <1 x i64> %a +// CHECK-LABEL: test_vdupq_lane_s64 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <1 x i64> %a, <1 x i64> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> %shuffle +// CHECK-LABEL: test_vdupq_lane_u64 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <1 x i64> %a, <1 x i64> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> %shuffle +// CHECK-LABEL: test_vdup_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> %vecinit7.i +// CHECK-LABEL: test_vdup_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = 
insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> %vecinit3.i +// CHECK-LABEL: test_vdup_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %a, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> %vecinit1.i +// CHECK-LABEL: test_vdup_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> %vecinit7.i +// CHECK-LABEL: test_vdup_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> %vecinit3.i +// CHECK-LABEL: test_vdup_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %a, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> %vecinit1.i +// CHECK-LABEL: test_vdup_n_p8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> %vecinit7.i +// CHECK-LABEL: test_vdup_n_p16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> %vecinit3.i +// CHECK-LABEL: test_vdup_n_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = load half, half* %a, align 2, !tbaa !3 +// CHECK-NEXT: %vecinit = insertelement <4 x half> undef, half %0, i32 0 +// CHECK-NEXT: %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x half> %vecinit3 +// CHECK-LABEL: test_vdup_n_f32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x float> undef, float %a, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x float> %vecinit.i, <2 x float> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x float> %vecinit1.i +// CHECK-LABEL: test_vdupq_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit15.i = shufflevector <16 x i8> %vecinit.i, <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> %vecinit15.i +// CHECK-LABEL: test_vdupq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> %vecinit7.i +// CHECK-LABEL: test_vdupq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> %vecinit3.i +// CHECK-LABEL: test_vdupq_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit15.i = shufflevector <16 x i8> %vecinit.i, <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x 
i8> %vecinit15.i +// CHECK-LABEL: test_vdupq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> %vecinit7.i +// CHECK-LABEL: test_vdupq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> %vecinit3.i +// CHECK-LABEL: test_vdupq_n_p8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit15.i = shufflevector <16 x i8> %vecinit.i, <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> %vecinit15.i +// CHECK-LABEL: test_vdupq_n_p16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> %vecinit7.i +// CHECK-LABEL: test_vdupq_n_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = load half, half* %a, align 2, !tbaa !3 +// CHECK-NEXT: %vecinit = insertelement <8 x half> undef, half %0, i32 0 +// CHECK-NEXT: %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x half> %vecinit7 +// CHECK-LABEL: test_vdupq_n_f32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x float> undef, float %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x float> %vecinit3.i +// CHECK-LABEL: test_vdup_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: %add.i = shl <1 x i64> %vecinit.i, <i64 1> +// CHECK-NEXT: ret <1 x i64> %add.i +// CHECK-LABEL: test_vdup_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: %add.i = shl <1 x i64> %vecinit.i, <i64 1> +// CHECK-NEXT: ret <1 x i64> %add.i +// CHECK-LABEL: test_vdupq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: %0 = shl <2 x i64> %vecinit.i, <i64 1, i64 undef> +// CHECK-NEXT: %add.i = shufflevector <2 x i64> %0, <2 x i64> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> %add.i +// CHECK-LABEL: test_vdupq_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: %0 = shl <2 x i64> %vecinit.i, <i64 1, i64 undef> +// CHECK-NEXT: %add.i = shufflevector <2 x i64> %0, <2 x i64> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> %add.i +// CHECK-LABEL: test_veor_s8 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <8 x i8> %b, %a +// CHECK-NEXT: ret <8 x i8> %xor.i +// CHECK-LABEL: test_veor_s16 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <4 x i16> %b, %a +// CHECK-NEXT: ret <4 x i16> %xor.i +// CHECK-LABEL: test_veor_s32 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <2 x i32> %b, %a +// CHECK-NEXT: ret <2 x i32> %xor.i +// CHECK-LABEL: test_veor_s64 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <1 x i64> %b, %a +// CHECK-NEXT: ret <1 x i64> %xor.i +// CHECK-LABEL: test_veor_u8 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <8 x i8> %b, %a +// CHECK-NEXT: ret <8 x i8> %xor.i +// CHECK-LABEL: test_veor_u16 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <4 x i16> %b, %a +// CHECK-NEXT: ret 
<4 x i16> %xor.i +// CHECK-LABEL: test_veor_u32 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <2 x i32> %b, %a +// CHECK-NEXT: ret <2 x i32> %xor.i +// CHECK-LABEL: test_veor_u64 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <1 x i64> %b, %a +// CHECK-NEXT: ret <1 x i64> %xor.i +// CHECK-LABEL: test_veorq_s8 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <16 x i8> %b, %a +// CHECK-NEXT: ret <16 x i8> %xor.i +// CHECK-LABEL: test_veorq_s16 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <8 x i16> %b, %a +// CHECK-NEXT: ret <8 x i16> %xor.i +// CHECK-LABEL: test_veorq_s32 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <4 x i32> %b, %a +// CHECK-NEXT: ret <4 x i32> %xor.i +// CHECK-LABEL: test_veorq_s64 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <2 x i64> %b, %a +// CHECK-NEXT: ret <2 x i64> %xor.i +// CHECK-LABEL: test_veorq_u8 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <16 x i8> %b, %a +// CHECK-NEXT: ret <16 x i8> %xor.i +// CHECK-LABEL: test_veorq_u16 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <8 x i16> %b, %a +// CHECK-NEXT: ret <8 x i16> %xor.i +// CHECK-LABEL: test_veorq_u32 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <4 x i32> %b, %a +// CHECK-NEXT: ret <4 x i32> %xor.i +// CHECK-LABEL: test_veorq_u64 +// CHECK: entry: +// CHECK-NEXT: %xor.i = xor <2 x i64> %b, %a +// CHECK-NEXT: ret <2 x i64> %xor.i +// CHECK-LABEL: test_vext_s8 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> +// CHECK-NEXT: ret <8 x i8> %vext +// CHECK-LABEL: test_vext_u8 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> +// CHECK-NEXT: ret <8 x i8> %vext +// CHECK-LABEL: test_vext_p8 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> +// CHECK-NEXT: ret <8 x i8> %vext +// CHECK-LABEL: test_vext_s16 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> +// CHECK-NEXT: ret <4 x i16> %vext +// CHECK-LABEL: test_vext_u16 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> +// CHECK-NEXT: ret <4 x i16> %vext +// CHECK-LABEL: test_vext_p16 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> +// CHECK-NEXT: ret <4 x i16> %vext +// CHECK-LABEL: test_vext_s32 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x i32> %vext +// CHECK-LABEL: test_vext_u32 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x i32> %vext +// CHECK-LABEL: test_vext_s64 +// CHECK: entry: +// CHECK-NEXT: ret <1 x i64> %a +// CHECK-LABEL: test_vext_u64 +// CHECK: entry: +// CHECK-NEXT: ret <1 x i64> %a +// CHECK-LABEL: test_vext_f32 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x float> %vext +// CHECK-LABEL: test_vextq_s8 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +// CHECK-NEXT: ret <16 x i8> %vext +// CHECK-LABEL: test_vextq_u8 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +// CHECK-NEXT: ret <16 x i8> %vext +// CHECK-LABEL: test_vextq_p8 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +// CHECK-NEXT: ret <16 x i8> %vext +// CHECK-LABEL: test_vextq_s16 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 
x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> +// CHECK-NEXT: ret <8 x i16> %vext +// CHECK-LABEL: test_vextq_u16 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> +// CHECK-NEXT: ret <8 x i16> %vext +// CHECK-LABEL: test_vextq_p16 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> +// CHECK-NEXT: ret <8 x i16> %vext +// CHECK-LABEL: test_vextq_s32 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> +// CHECK-NEXT: ret <4 x i32> %vext +// CHECK-LABEL: test_vextq_u32 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> +// CHECK-NEXT: ret <4 x i32> %vext +// CHECK-LABEL: test_vextq_s64 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x i64> %vext +// CHECK-LABEL: test_vextq_u64 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x i64> %vext +// CHECK-LABEL: test_vextq_f32 +// CHECK: entry: +// CHECK-NEXT: %vext = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> +// CHECK-NEXT: ret <4 x float> %vext +// CHECK-LABEL: test_vfma_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %c, <2 x float> %a) +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vfmaq_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a) +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vfms_f32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b +// CHECK-NEXT: %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %sub.i, <2 x float> %c, <2 x float> %a) +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vfmsq_f32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b +// CHECK-NEXT: %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %c, <4 x float> %a) +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vget_high_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vget_high_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vget_high_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> +// CHECK-NEXT: ret <2 x i32> %shuffle.i +// CHECK-LABEL: test_vget_high_s64 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1> +// CHECK-NEXT: ret <1 x i64> %shuffle.i +// CHECK-LABEL: test_vget_high_f16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: ret <4 x half> %shuffle.i +// CHECK-LABEL: test_vget_high_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3> +// CHECK-NEXT: ret <2 x float> %shuffle.i +// CHECK-LABEL: test_vget_high_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vget_high_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vget_high_u32 +// CHECK: entry: +// 
CHECK-NEXT: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> +// CHECK-NEXT: ret <2 x i32> %shuffle.i +// CHECK-LABEL: test_vget_high_u64 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1> +// CHECK-NEXT: ret <1 x i64> %shuffle.i +// CHECK-LABEL: test_vget_high_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vget_high_p16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vget_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <8 x i8> %a, i32 7 +// CHECK-NEXT: ret i8 %vget_lane +// CHECK-LABEL: test_vget_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <4 x i16> %a, i32 3 +// CHECK-NEXT: ret i16 %vget_lane +// CHECK-LABEL: test_vget_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <2 x i32> %a, i32 1 +// CHECK-NEXT: ret i32 %vget_lane +// CHECK-LABEL: test_vget_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <8 x i8> %a, i32 7 +// CHECK-NEXT: ret i8 %vget_lane +// CHECK-LABEL: test_vget_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <4 x i16> %a, i32 3 +// CHECK-NEXT: ret i16 %vget_lane +// CHECK-LABEL: test_vget_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <2 x i32> %a, i32 1 +// CHECK-NEXT: ret i32 %vget_lane +// CHECK-LABEL: test_vget_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <8 x i8> %a, i32 7 +// CHECK-NEXT: ret i8 %vget_lane +// CHECK-LABEL: test_vget_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <4 x i16> %a, i32 3 +// CHECK-NEXT: ret i16 %vget_lane +// CHECK-LABEL: test_vget_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <2 x float> %a, i32 1 +// CHECK-NEXT: ret float %vget_lane +// CHECK-LABEL: test_vget_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <4 x half> %a, i32 1 +// CHECK-NEXT: %conv = fpext half %0 to float +// CHECK-NEXT: ret float %conv +// CHECK-LABEL: test_vgetq_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <16 x i8> %a, i32 15 +// CHECK-NEXT: ret i8 %vget_lane +// CHECK-LABEL: test_vgetq_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <8 x i16> %a, i32 7 +// CHECK-NEXT: ret i16 %vget_lane +// CHECK-LABEL: test_vgetq_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <4 x i32> %a, i32 3 +// CHECK-NEXT: ret i32 %vget_lane +// CHECK-LABEL: test_vgetq_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <16 x i8> %a, i32 15 +// CHECK-NEXT: ret i8 %vget_lane +// CHECK-LABEL: test_vgetq_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <8 x i16> %a, i32 7 +// CHECK-NEXT: ret i16 %vget_lane +// CHECK-LABEL: test_vgetq_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <4 x i32> %a, i32 3 +// CHECK-NEXT: ret i32 %vget_lane +// CHECK-LABEL: test_vgetq_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <16 x i8> %a, i32 15 +// CHECK-NEXT: ret i8 %vget_lane +// CHECK-LABEL: test_vgetq_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <8 x i16> %a, i32 7 +// CHECK-NEXT: ret i16 %vget_lane +// CHECK-LABEL: test_vgetq_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <4 x 
float> %a, i32 3 +// CHECK-NEXT: ret float %vget_lane +// CHECK-LABEL: test_vgetq_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <8 x half> %a, i32 3 +// CHECK-NEXT: %conv = fpext half %0 to float +// CHECK-NEXT: ret float %conv +// CHECK-LABEL: test_vget_lane_s64 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <1 x i64> %a, i32 0 +// CHECK-NEXT: ret i64 %vget_lane +// CHECK-LABEL: test_vget_lane_u64 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <1 x i64> %a, i32 0 +// CHECK-NEXT: ret i64 %vget_lane +// CHECK-LABEL: test_vgetq_lane_s64 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <2 x i64> %a, i32 1 +// CHECK-NEXT: ret i64 %vget_lane +// CHECK-LABEL: test_vgetq_lane_u64 +// CHECK: entry: +// CHECK-NEXT: %vget_lane = extractelement <2 x i64> %a, i32 1 +// CHECK-NEXT: ret i64 %vget_lane +// CHECK-LABEL: test_vget_low_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vget_low_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vget_low_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> +// CHECK-NEXT: ret <2 x i32> %shuffle.i +// CHECK-LABEL: test_vget_low_s64 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> %shuffle.i +// CHECK-LABEL: test_vget_low_f16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-NEXT: ret <4 x half> %shuffle.i +// CHECK-LABEL: test_vget_low_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1> +// CHECK-NEXT: ret <2 x float> %shuffle.i +// CHECK-LABEL: test_vget_low_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vget_low_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vget_low_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> +// CHECK-NEXT: ret <2 x i32> %shuffle.i +// CHECK-LABEL: test_vget_low_u64 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> %shuffle.i +// CHECK-LABEL: test_vget_low_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vget_low_p16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vhadd_s8 +// CHECK: entry: +// CHECK-NEXT: %vhadd_v.i = tail call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vhadd_v.i +// CHECK-LABEL: test_vhadd_s16 +// CHECK: entry: +// CHECK-NEXT: %vhadd_v2.i = tail call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vhadd_v2.i +// CHECK-LABEL: test_vhadd_s32 +// CHECK: entry: +// CHECK-NEXT: %vhadd_v2.i = tail call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x 
i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vhadd_v2.i +// CHECK-LABEL: test_vhadd_u8 +// CHECK: entry: +// CHECK-NEXT: %vhadd_v.i = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vhadd_v.i +// CHECK-LABEL: test_vhadd_u16 +// CHECK: entry: +// CHECK-NEXT: %vhadd_v2.i = tail call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vhadd_v2.i +// CHECK-LABEL: test_vhadd_u32 +// CHECK: entry: +// CHECK-NEXT: %vhadd_v2.i = tail call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vhadd_v2.i +// CHECK-LABEL: test_vhaddq_s8 +// CHECK: entry: +// CHECK-NEXT: %vhaddq_v.i = tail call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vhaddq_v.i +// CHECK-LABEL: test_vhaddq_s16 +// CHECK: entry: +// CHECK-NEXT: %vhaddq_v2.i = tail call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vhaddq_v2.i +// CHECK-LABEL: test_vhaddq_s32 +// CHECK: entry: +// CHECK-NEXT: %vhaddq_v2.i = tail call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vhaddq_v2.i +// CHECK-LABEL: test_vhaddq_u8 +// CHECK: entry: +// CHECK-NEXT: %vhaddq_v.i = tail call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vhaddq_v.i +// CHECK-LABEL: test_vhaddq_u16 +// CHECK: entry: +// CHECK-NEXT: %vhaddq_v2.i = tail call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vhaddq_v2.i +// CHECK-LABEL: test_vhaddq_u32 +// CHECK: entry: +// CHECK-NEXT: %vhaddq_v2.i = tail call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vhaddq_v2.i +// CHECK-LABEL: test_vhsub_s8 +// CHECK: entry: +// CHECK-NEXT: %vhsub_v.i = tail call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vhsub_v.i +// CHECK-LABEL: test_vhsub_s16 +// CHECK: entry: +// CHECK-NEXT: %vhsub_v2.i = tail call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vhsub_v2.i +// CHECK-LABEL: test_vhsub_s32 +// CHECK: entry: +// CHECK-NEXT: %vhsub_v2.i = tail call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vhsub_v2.i +// CHECK-LABEL: test_vhsub_u8 +// CHECK: entry: +// CHECK-NEXT: %vhsub_v.i = tail call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vhsub_v.i +// CHECK-LABEL: test_vhsub_u16 +// CHECK: entry: +// CHECK-NEXT: %vhsub_v2.i = tail call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vhsub_v2.i +// CHECK-LABEL: test_vhsub_u32 +// CHECK: entry: +// CHECK-NEXT: %vhsub_v2.i = tail call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vhsub_v2.i +// CHECK-LABEL: test_vhsubq_s8 +// CHECK: entry: +// CHECK-NEXT: %vhsubq_v.i = tail call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vhsubq_v.i +// CHECK-LABEL: test_vhsubq_s16 +// CHECK: entry: +// CHECK-NEXT: %vhsubq_v2.i = tail call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vhsubq_v2.i +// CHECK-LABEL: test_vhsubq_s32 +// CHECK: entry: +// CHECK-NEXT: %vhsubq_v2.i = tail call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 
x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vhsubq_v2.i +// CHECK-LABEL: test_vhsubq_u8 +// CHECK: entry: +// CHECK-NEXT: %vhsubq_v.i = tail call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vhsubq_v.i +// CHECK-LABEL: test_vhsubq_u16 +// CHECK: entry: +// CHECK-NEXT: %vhsubq_v2.i = tail call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vhsubq_v2.i +// CHECK-LABEL: test_vhsubq_u32 +// CHECK: entry: +// CHECK-NEXT: %vhsubq_v2.i = tail call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vhsubq_v2.i +// CHECK-LABEL: test_vld1q_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i8* %a to <16 x i8>* +// CHECK-NEXT: %1 = load <16 x i8>, <16 x i8>* %0, align 1 +// CHECK-NEXT: ret <16 x i8> %1 +// CHECK-LABEL: test_vld1q_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to <8 x i16>* +// CHECK-NEXT: %1 = load <8 x i16>, <8 x i16>* %0, align 2 +// CHECK-NEXT: ret <8 x i16> %1 +// CHECK-LABEL: test_vld1q_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i32* %a to <4 x i32>* +// CHECK-NEXT: %1 = load <4 x i32>, <4 x i32>* %0, align 4 +// CHECK-NEXT: ret <4 x i32> %1 +// CHECK-LABEL: test_vld1q_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to <2 x i64>* +// CHECK-NEXT: %1 = load <2 x i64>, <2 x i64>* %0, align 8 +// CHECK-NEXT: ret <2 x i64> %1 +// CHECK-LABEL: test_vld1q_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i8* %a to <16 x i8>* +// CHECK-NEXT: %1 = load <16 x i8>, <16 x i8>* %0, align 1 +// CHECK-NEXT: ret <16 x i8> %1 +// CHECK-LABEL: test_vld1q_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to <8 x i16>* +// CHECK-NEXT: %1 = load <8 x i16>, <8 x i16>* %0, align 2 +// CHECK-NEXT: ret <8 x i16> %1 +// CHECK-LABEL: test_vld1q_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i32* %a to <4 x i32>* +// CHECK-NEXT: %1 = load <4 x i32>, <4 x i32>* %0, align 4 +// CHECK-NEXT: ret <4 x i32> %1 +// CHECK-LABEL: test_vld1q_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to <2 x i64>* +// CHECK-NEXT: %1 = load <2 x i64>, <2 x i64>* %0, align 8 +// CHECK-NEXT: ret <2 x i64> %1 +// CHECK-LABEL: test_vld1q_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast half* %a to <8 x half>* +// CHECK-NEXT: %1 = load <8 x half>, <8 x half>* %0, align 2 +// CHECK-NEXT: ret <8 x half> %1 +// CHECK-LABEL: test_vld1q_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast float* %a to <4 x float>* +// CHECK-NEXT: %1 = load <4 x float>, <4 x float>* %0, align 4 +// CHECK-NEXT: ret <4 x float> %1 +// CHECK-LABEL: test_vld1q_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i8* %a to <16 x i8>* +// CHECK-NEXT: %1 = load <16 x i8>, <16 x i8>* %0, align 1 +// CHECK-NEXT: ret <16 x i8> %1 +// CHECK-LABEL: test_vld1q_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to <8 x i16>* +// CHECK-NEXT: %1 = load <8 x i16>, <8 x i16>* %0, align 2 +// CHECK-NEXT: ret <8 x i16> %1 +// CHECK-LABEL: test_vld1_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i8* %a to <8 x i8>* +// CHECK-NEXT: %1 = load <8 x i8>, <8 x i8>* %0, align 1 +// CHECK-NEXT: ret <8 x i8> %1 +// CHECK-LABEL: test_vld1_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to <4 x i16>* +// CHECK-NEXT: %1 = load <4 x i16>, <4 x i16>* %0, align 2 +// CHECK-NEXT: ret <4 x i16> %1 +// CHECK-LABEL: test_vld1_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i32* %a to <2 x i32>* +// CHECK-NEXT: %1 = load <2 x i32>, <2 x i32>* %0, align 4 +// 
CHECK-NEXT: ret <2 x i32> %1 +// CHECK-LABEL: test_vld1_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to <1 x i64>* +// CHECK-NEXT: %1 = load <1 x i64>, <1 x i64>* %0, align 8 +// CHECK-NEXT: ret <1 x i64> %1 +// CHECK-LABEL: test_vld1_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i8* %a to <8 x i8>* +// CHECK-NEXT: %1 = load <8 x i8>, <8 x i8>* %0, align 1 +// CHECK-NEXT: ret <8 x i8> %1 +// CHECK-LABEL: test_vld1_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to <4 x i16>* +// CHECK-NEXT: %1 = load <4 x i16>, <4 x i16>* %0, align 2 +// CHECK-NEXT: ret <4 x i16> %1 +// CHECK-LABEL: test_vld1_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i32* %a to <2 x i32>* +// CHECK-NEXT: %1 = load <2 x i32>, <2 x i32>* %0, align 4 +// CHECK-NEXT: ret <2 x i32> %1 +// CHECK-LABEL: test_vld1_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to <1 x i64>* +// CHECK-NEXT: %1 = load <1 x i64>, <1 x i64>* %0, align 8 +// CHECK-NEXT: ret <1 x i64> %1 +// CHECK-LABEL: test_vld1_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast half* %a to <4 x half>* +// CHECK-NEXT: %1 = load <4 x half>, <4 x half>* %0, align 2 +// CHECK-NEXT: ret <4 x half> %1 +// CHECK-LABEL: test_vld1_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast float* %a to <2 x float>* +// CHECK-NEXT: %1 = load <2 x float>, <2 x float>* %0, align 4 +// CHECK-NEXT: ret <2 x float> %1 +// CHECK-LABEL: test_vld1_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i8* %a to <8 x i8>* +// CHECK-NEXT: %1 = load <8 x i8>, <8 x i8>* %0, align 1 +// CHECK-NEXT: ret <8 x i8> %1 +// CHECK-LABEL: test_vld1_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to <4 x i16>* +// CHECK-NEXT: %1 = load <4 x i16>, <4 x i16>* %0, align 2 +// CHECK-NEXT: ret <4 x i16> %1 +// CHECK-LABEL: test_vld1q_dup_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %1 = insertelement <16 x i8> undef, i8 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> %lane +// CHECK-LABEL: test_vld1q_dup_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %1 = insertelement <8 x i16> undef, i16 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> %lane +// CHECK-LABEL: test_vld1q_dup_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = load i32, i32* %a, align 4 +// CHECK-NEXT: %1 = insertelement <4 x i32> undef, i32 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> %lane +// CHECK-LABEL: test_vld1q_dup_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = load i64, i64* %a, align 8 +// CHECK-NEXT: %1 = insertelement <2 x i64> undef, i64 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> %lane +// CHECK-LABEL: test_vld1q_dup_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %1 = insertelement <16 x i8> undef, i8 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> %lane +// CHECK-LABEL: test_vld1q_dup_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %1 = insertelement <8 x i16> undef, i16 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> 
%lane +// CHECK-LABEL: test_vld1q_dup_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = load i32, i32* %a, align 4 +// CHECK-NEXT: %1 = insertelement <4 x i32> undef, i32 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> %lane +// CHECK-LABEL: test_vld1q_dup_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = load i64, i64* %a, align 8 +// CHECK-NEXT: %1 = insertelement <2 x i64> undef, i64 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> %lane +// CHECK-LABEL: test_vld1q_dup_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = load half, half* %a, align 2 +// CHECK-NEXT: %1 = insertelement <8 x half> undef, half %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x half> %lane +// CHECK-LABEL: test_vld1q_dup_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = load float, float* %a, align 4 +// CHECK-NEXT: %1 = insertelement <4 x float> undef, float %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x float> %lane +// CHECK-LABEL: test_vld1q_dup_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %1 = insertelement <16 x i8> undef, i8 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> %lane +// CHECK-LABEL: test_vld1q_dup_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %1 = insertelement <8 x i16> undef, i16 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> %lane +// CHECK-LABEL: test_vld1_dup_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %1 = insertelement <8 x i8> undef, i8 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> %lane +// CHECK-LABEL: test_vld1_dup_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %1 = insertelement <4 x i16> undef, i16 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> %lane +// CHECK-LABEL: test_vld1_dup_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = load i32, i32* %a, align 4 +// CHECK-NEXT: %1 = insertelement <2 x i32> undef, i32 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> %lane +// CHECK-LABEL: test_vld1_dup_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = load i64, i64* %a, align 8 +// CHECK-NEXT: %1 = insertelement <1 x i64> undef, i64 %0, i32 0 +// CHECK-NEXT: ret <1 x i64> %1 +// CHECK-LABEL: test_vld1_dup_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %1 = insertelement <8 x i8> undef, i8 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> %lane +// CHECK-LABEL: test_vld1_dup_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %1 = insertelement <4 x i16> undef, i16 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> %lane +// CHECK-LABEL: test_vld1_dup_s32 +// CHECK: entry: +// 
CHECK-NEXT: %0 = load i32, i32* %a, align 4 +// CHECK-NEXT: %1 = insertelement <2 x i32> undef, i32 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> %lane +// CHECK-LABEL: test_vld1_dup_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = load i64, i64* %a, align 8 +// CHECK-NEXT: %1 = insertelement <1 x i64> undef, i64 %0, i32 0 +// CHECK-NEXT: ret <1 x i64> %1 +// CHECK-LABEL: test_vld1_dup_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = load half, half* %a, align 2 +// CHECK-NEXT: %1 = insertelement <4 x half> undef, half %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x half> %lane +// CHECK-LABEL: test_vld1_dup_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = load float, float* %a, align 4 +// CHECK-NEXT: %1 = insertelement <2 x float> undef, float %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x float> %lane +// CHECK-LABEL: test_vld1_dup_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %1 = insertelement <8 x i8> undef, i8 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> %lane +// CHECK-LABEL: test_vld1_dup_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %1 = insertelement <4 x i16> undef, i16 %0, i32 0 +// CHECK-NEXT: %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> %lane +// CHECK-LABEL: test_vld1q_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15 +// CHECK-NEXT: ret <16 x i8> %vld1_lane +// CHECK-LABEL: test_vld1q_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7 +// CHECK-NEXT: ret <8 x i16> %vld1_lane +// CHECK-LABEL: test_vld1q_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = load i32, i32* %a, align 4 +// CHECK-NEXT: %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3 +// CHECK-NEXT: ret <4 x i32> %vld1_lane +// CHECK-LABEL: test_vld1q_lane_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = shufflevector <2 x i64> %b, <2 x i64> undef, <1 x i32> zeroinitializer +// CHECK-NEXT: %1 = bitcast i64* %a to <1 x i64>* +// CHECK-NEXT: %2 = load <1 x i64>, <1 x i64>* %1, align 8 +// CHECK-NEXT: %vld1q_lane = shufflevector <1 x i64> %0, <1 x i64> %2, <2 x i32> <i32 0, i32 1> +// CHECK-NEXT: ret <2 x i64> %vld1q_lane +// CHECK-LABEL: test_vld1q_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15 +// CHECK-NEXT: ret <16 x i8> %vld1_lane +// CHECK-LABEL: test_vld1q_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7 +// CHECK-NEXT: ret <8 x i16> %vld1_lane +// CHECK-LABEL: test_vld1q_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = load i32, i32* %a, align 4 +// CHECK-NEXT: %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3 +// CHECK-NEXT: ret <4 x i32> %vld1_lane +// CHECK-LABEL: test_vld1q_lane_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = shufflevector <2 x i64> %b, <2 x i64> undef, <1 x i32> zeroinitializer +// CHECK-NEXT: %1 = bitcast i64* %a to <1 x i64>* +// CHECK-NEXT: %2 = load <1 x i64>, <1 x 
i64>* %1, align 8 +// CHECK-NEXT: %vld1q_lane = shufflevector <1 x i64> %0, <1 x i64> %2, <2 x i32> <i32 0, i32 1> +// CHECK-NEXT: ret <2 x i64> %vld1q_lane +// CHECK-LABEL: test_vld1q_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = load half, half* %a, align 2 +// CHECK-NEXT: %vld1_lane = insertelement <8 x half> %b, half %0, i32 7 +// CHECK-NEXT: ret <8 x half> %vld1_lane +// CHECK-LABEL: test_vld1q_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = load float, float* %a, align 4 +// CHECK-NEXT: %vld1_lane = insertelement <4 x float> %b, float %0, i32 3 +// CHECK-NEXT: ret <4 x float> %vld1_lane +// CHECK-LABEL: test_vld1q_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15 +// CHECK-NEXT: ret <16 x i8> %vld1_lane +// CHECK-LABEL: test_vld1q_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7 +// CHECK-NEXT: ret <8 x i16> %vld1_lane +// CHECK-LABEL: test_vld1_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7 +// CHECK-NEXT: ret <8 x i8> %vld1_lane +// CHECK-LABEL: test_vld1_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3 +// CHECK-NEXT: ret <4 x i16> %vld1_lane +// CHECK-LABEL: test_vld1_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = load i32, i32* %a, align 4 +// CHECK-NEXT: %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1 +// CHECK-NEXT: ret <2 x i32> %vld1_lane +// CHECK-LABEL: test_vld1_lane_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = load i64, i64* %a, align 8 +// CHECK-NEXT: %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0 +// CHECK-NEXT: ret <1 x i64> %vld1_lane +// CHECK-LABEL: test_vld1_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7 +// CHECK-NEXT: ret <8 x i8> %vld1_lane +// CHECK-LABEL: test_vld1_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3 +// CHECK-NEXT: ret <4 x i16> %vld1_lane +// CHECK-LABEL: test_vld1_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = load i32, i32* %a, align 4 +// CHECK-NEXT: %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1 +// CHECK-NEXT: ret <2 x i32> %vld1_lane +// CHECK-LABEL: test_vld1_lane_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = load i64, i64* %a, align 8 +// CHECK-NEXT: %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0 +// CHECK-NEXT: ret <1 x i64> %vld1_lane +// CHECK-LABEL: test_vld1_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = load half, half* %a, align 2 +// CHECK-NEXT: %vld1_lane = insertelement <4 x half> %b, half %0, i32 3 +// CHECK-NEXT: ret <4 x half> %vld1_lane +// CHECK-LABEL: test_vld1_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = load float, float* %a, align 4 +// CHECK-NEXT: %vld1_lane = insertelement <2 x float> %b, float %0, i32 1 +// CHECK-NEXT: ret <2 x float> %vld1_lane +// CHECK-LABEL: test_vld1_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = load i8, i8* %a, align 1 +// CHECK-NEXT: %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7 +// CHECK-NEXT: ret <8 x i8> %vld1_lane +// CHECK-LABEL: test_vld1_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = load i16, i16* %a, align 2 +// CHECK-NEXT: %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3 +// 
CHECK-NEXT: ret <4 x i16> %vld1_lane +// CHECK-LABEL: test_vld2q_u8 +// CHECK: entry: +// CHECK-NEXT: %vld2q_v = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1) +// CHECK-NEXT: %vld2q_v.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2q_v, 0 +// CHECK-NEXT: %vld2q_v.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2q_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <16 x i8> %vld2q_v.fca.0.extract, <16 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <16 x i8> %vld2q_v.fca.1.extract, <16 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld2q_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld2q_v = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* %0, i32 2) +// CHECK-NEXT: %vld2q_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_v, 0 +// CHECK-NEXT: %vld2q_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i16> %vld2q_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i16> %vld2q_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld2q_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i32* %a to i8* +// CHECK-NEXT: %vld2q_v = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* %0, i32 4) +// CHECK-NEXT: %vld2q_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2q_v, 0 +// CHECK-NEXT: %vld2q_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2q_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i32> %vld2q_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i32> %vld2q_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld2q_s8 +// CHECK: entry: +// CHECK-NEXT: %vld2q_v = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1) +// CHECK-NEXT: %vld2q_v.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2q_v, 0 +// CHECK-NEXT: %vld2q_v.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2q_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <16 x i8> %vld2q_v.fca.0.extract, <16 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <16 x i8> %vld2q_v.fca.1.extract, <16 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8 +// CHECK-NEXT: ret 
void +// CHECK-LABEL: test_vld2q_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld2q_v = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* %0, i32 2) +// CHECK-NEXT: %vld2q_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_v, 0 +// CHECK-NEXT: %vld2q_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i16> %vld2q_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i16> %vld2q_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld2q_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i32* %a to i8* +// CHECK-NEXT: %vld2q_v = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* %0, i32 4) +// CHECK-NEXT: %vld2q_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2q_v, 0 +// CHECK-NEXT: %vld2q_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2q_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i32> %vld2q_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i32> %vld2q_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld2q_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast half* %a to i8* +// CHECK-NEXT: %vld2q_v = tail call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2.v8f16.p0i8(i8* %0, i32 2) +// CHECK-NEXT: %vld2q_v.fca.0.extract = extractvalue { <8 x half>, <8 x half> } %vld2q_v, 0 +// CHECK-NEXT: %vld2q_v.fca.1.extract = extractvalue { <8 x half>, <8 x half> } %vld2q_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x half> %vld2q_v.fca.0.extract, <8 x half>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x half> %vld2q_v.fca.1.extract, <8 x half>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld2q_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast float* %a to i8* +// CHECK-NEXT: %vld2q_v = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %0, i32 4) +// CHECK-NEXT: %vld2q_v.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2q_v, 0 +// CHECK-NEXT: %vld2q_v.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2q_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x float> %vld2q_v.fca.0.extract, <4 x float>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x float> 
%vld2q_v.fca.1.extract, <4 x float>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld2q_p8 +// CHECK: entry: +// CHECK-NEXT: %vld2q_v = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1) +// CHECK-NEXT: %vld2q_v.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2q_v, 0 +// CHECK-NEXT: %vld2q_v.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2q_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <16 x i8> %vld2q_v.fca.0.extract, <16 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <16 x i8> %vld2q_v.fca.1.extract, <16 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld2q_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld2q_v = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* %0, i32 2) +// CHECK-NEXT: %vld2q_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_v, 0 +// CHECK-NEXT: %vld2q_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i16> %vld2q_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i16> %vld2q_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld2_u8 +// CHECK: entry: +// CHECK-NEXT: %vld2_v = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1) +// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_v, 0 +// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i8> %vld2_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i8> %vld2_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld2_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld2_v = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* %0, i32 2) +// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_v, 0 +// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_v, 1 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i16> %vld2_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i16> %vld2_v.fca.1.extract, <4 x i16>* 
%__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_u32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld2_v = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_v, 0
+// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x i32> %vld2_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x i32> %vld2_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_u64
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i64* %a to i8*
+// CHECK-NEXT: %vld2_v = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* %0, i32 8)
+// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_v, 0
+// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <1 x i64> %vld2_v.fca.0.extract, <1 x i64>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <1 x i64> %vld2_v.fca.1.extract, <1 x i64>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_s8
+// CHECK: entry:
+// CHECK-NEXT: %vld2_v = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_v, 0
+// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld2_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld2_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_s16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld2_v = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_v, 0
+// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld2_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld2_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_s32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld2_v = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_v, 0
+// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x i32> %vld2_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x i32> %vld2_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_s64
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i64* %a to i8*
+// CHECK-NEXT: %vld2_v = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* %0, i32 8)
+// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_v, 0
+// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <1 x i64> %vld2_v.fca.0.extract, <1 x i64>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <1 x i64> %vld2_v.fca.1.extract, <1 x i64>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_f16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast half* %a to i8*
+// CHECK-NEXT: %vld2_v = tail call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2.v4f16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <4 x half>, <4 x half> } %vld2_v, 0
+// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <4 x half>, <4 x half> } %vld2_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x half> %vld2_v.fca.0.extract, <4 x half>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x half> %vld2_v.fca.1.extract, <4 x half>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_f32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast float* %a to i8*
+// CHECK-NEXT: %vld2_v = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_v, 0
+// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x float> %vld2_v.fca.0.extract, <2 x float>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x float> %vld2_v.fca.1.extract, <2 x float>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_p8
+// CHECK: entry:
+// CHECK-NEXT: %vld2_v = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_v, 0
+// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld2_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld2_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_p16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld2_v = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld2_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_v, 0
+// CHECK-NEXT: %vld2_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld2_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld2_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: ret void
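+//
+// The vld2q_lane expectations below assume driver functions of the usual
+// arm_neon_intrinsics.c shape (a sketch based on the ACLE signatures; the
+// exact drivers live in the .c file), e.g.:
+//   uint16x8x2_t test_vld2q_lane_u16(uint16_t const *a, uint16x8x2_t b) {
+//     return vld2q_lane_u16(a, b, 7);
+//   }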
+// CHECK-LABEL: test_vld2q_lane_u16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %4 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld2q_lane_v = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %4, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, i32 7, i32 2)
+// CHECK-NEXT: %vld2q_lane_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_lane_v, 0
+// CHECK-NEXT: %vld2q_lane_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i16> %vld2q_lane_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i16> %vld2q_lane_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2q_lane_u32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32>
+// CHECK-NEXT: %4 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld2q_lane_v = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* %4, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, i32 3, i32 4)
+// CHECK-NEXT: %vld2q_lane_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2q_lane_v, 0
+// CHECK-NEXT: %vld2q_lane_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2q_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i32> %vld2q_lane_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i32> %vld2q_lane_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2q_lane_s16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %4 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld2q_lane_v = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %4, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, i32 7, i32 2)
+// CHECK-NEXT: %vld2q_lane_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_lane_v, 0
+// CHECK-NEXT: %vld2q_lane_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i16> %vld2q_lane_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i16> %vld2q_lane_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2q_lane_s32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32>
+// CHECK-NEXT: %4 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld2q_lane_v = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* %4, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, i32 3, i32 4)
+// CHECK-NEXT: %vld2q_lane_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2q_lane_v, 0
+// CHECK-NEXT: %vld2q_lane_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2q_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i32> %vld2q_lane_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i32> %vld2q_lane_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2q_lane_f16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x half> %0, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x half> %1, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x half> %b.sroa.0.0.vec.expand, <8 x half> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x half> %2, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x half> %3, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x half> %b.sroa.3.16.vec.expand, <8 x half> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %4 = bitcast half* %a to i8*
+// CHECK-NEXT: %vld2q_lane_v = tail call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2lane.v8f16.p0i8(i8* %4, <8 x half> %b.sroa.0.8.vecblend, <8 x half> %b.sroa.3.24.vecblend, i32 7, i32 2)
+// CHECK-NEXT: %vld2q_lane_v.fca.0.extract = extractvalue { <8 x half>, <8 x half> } %vld2q_lane_v, 0
+// CHECK-NEXT: %vld2q_lane_v.fca.1.extract = extractvalue { <8 x half>, <8 x half> } %vld2q_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x half> %vld2q_lane_v.fca.0.extract, <8 x half>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x half> %vld2q_lane_v.fca.1.extract, <8 x half>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2q_lane_f32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x float> %b.sroa.0.0.vec.expand, <4 x float> %b.sroa.0.8.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x float> %b.sroa.3.16.vec.expand, <4 x float> %b.sroa.3.24.vec.expand, <4 x i32>
+// CHECK-NEXT: %4 = bitcast float* %a to i8*
+// CHECK-NEXT: %vld2q_lane_v = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* %4, <4 x float> %b.sroa.0.8.vecblend, <4 x float> %b.sroa.3.24.vecblend, i32 3, i32 4)
+// CHECK-NEXT: %vld2q_lane_v.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2q_lane_v, 0
+// CHECK-NEXT: %vld2q_lane_v.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2q_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x float> %vld2q_lane_v.fca.0.extract, <4 x float>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x float> %vld2q_lane_v.fca.1.extract, <4 x float>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2q_lane_p16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %4 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld2q_lane_v = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %4, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, i32 7, i32 2)
+// CHECK-NEXT: %vld2q_lane_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_lane_v, 0
+// CHECK-NEXT: %vld2q_lane_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2q_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i16> %vld2q_lane_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i16> %vld2q_lane_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
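+//
+// The 64-bit vld2_lane expectations below assume drivers of the same shape
+// (a sketch based on the ACLE signatures), e.g.:
+//   uint8x8x2_t test_vld2_lane_u8(uint8_t const *a, uint8x8x2_t b) {
+//     return vld2_lane_u8(a, b, 7);
+//   }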
+// CHECK-LABEL: test_vld2_lane_u8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %vld2_lane_v = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> %0, <8 x i8> %1, i32 7, i32 1)
+// CHECK-NEXT: %vld2_lane_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane_v, 0
+// CHECK-NEXT: %vld2_lane_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx3 = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld2_lane_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx3, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld2_lane_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_lane_u16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld2_lane_v = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %2, <4 x i16> %0, <4 x i16> %1, i32 3, i32 2)
+// CHECK-NEXT: %vld2_lane_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane_v, 0
+// CHECK-NEXT: %vld2_lane_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld2_lane_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld2_lane_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_lane_u32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %2 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld2_lane_v = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %2, <2 x i32> %0, <2 x i32> %1, i32 1, i32 4)
+// CHECK-NEXT: %vld2_lane_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane_v, 0
+// CHECK-NEXT: %vld2_lane_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x i32> %vld2_lane_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x i32> %vld2_lane_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_lane_s8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %vld2_lane_v = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> %0, <8 x i8> %1, i32 7, i32 1)
+// CHECK-NEXT: %vld2_lane_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane_v, 0
+// CHECK-NEXT: %vld2_lane_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx3 = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld2_lane_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx3, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld2_lane_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_lane_s16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld2_lane_v = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %2, <4 x i16> %0, <4 x i16> %1, i32 3, i32 2)
+// CHECK-NEXT: %vld2_lane_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane_v, 0
+// CHECK-NEXT: %vld2_lane_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld2_lane_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld2_lane_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_lane_s32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %2 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld2_lane_v = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %2, <2 x i32> %0, <2 x i32> %1, i32 1, i32 4)
+// CHECK-NEXT: %vld2_lane_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane_v, 0
+// CHECK-NEXT: %vld2_lane_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x i32> %vld2_lane_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x i32> %vld2_lane_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_lane_f16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half>
+// CHECK-NEXT: %2 = bitcast half* %a to i8*
+// CHECK-NEXT: %vld2_lane_v = tail call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2lane.v4f16.p0i8(i8* %2, <4 x half> %0, <4 x half> %1, i32 3, i32 2)
+// CHECK-NEXT: %vld2_lane_v.fca.0.extract = extractvalue { <4 x half>, <4 x half> } %vld2_lane_v, 0
+// CHECK-NEXT: %vld2_lane_v.fca.1.extract = extractvalue { <4 x half>, <4 x half> } %vld2_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x half> %vld2_lane_v.fca.0.extract, <4 x half>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x half> %vld2_lane_v.fca.1.extract, <4 x half>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_lane_f32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float>
+// CHECK-NEXT: %2 = bitcast float* %a to i8*
+// CHECK-NEXT: %vld2_lane_v = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* %2, <2 x float> %0, <2 x float> %1, i32 1, i32 4)
+// CHECK-NEXT: %vld2_lane_v.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane_v, 0
+// CHECK-NEXT: %vld2_lane_v.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x float> %vld2_lane_v.fca.0.extract, <2 x float>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x float> %vld2_lane_v.fca.1.extract, <2 x float>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_lane_p8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %vld2_lane_v = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> %0, <8 x i8> %1, i32 7, i32 1)
+// CHECK-NEXT: %vld2_lane_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane_v, 0
+// CHECK-NEXT: %vld2_lane_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx3 = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld2_lane_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx3, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld2_lane_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld2_lane_p16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld2_lane_v = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %2, <4 x i16> %0, <4 x i16> %1, i32 3, i32 2)
+// CHECK-NEXT: %vld2_lane_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane_v, 0
+// CHECK-NEXT: %vld2_lane_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane_v, 1
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld2_lane_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld2_lane_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
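+//
+// The three-register vld3q expectations below assume drivers of the form
+// (a sketch based on the ACLE signatures), e.g.:
+//   uint8x16x3_t test_vld3q_u8(uint8_t const *a) { return vld3q_u8(a); }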
+// CHECK-LABEL: test_vld3q_u8
+// CHECK: entry:
+// CHECK-NEXT: %vld3q_v = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld3q_v.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3q_v, 0
+// CHECK-NEXT: %vld3q_v.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3q_v, 1
+// CHECK-NEXT: %vld3q_v.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3q_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <16 x i8> %vld3q_v.fca.0.extract, <16 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <16 x i8> %vld3q_v.fca.1.extract, <16 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <16 x i8> %vld3q_v.fca.2.extract, <16 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3q_u16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld3q_v = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld3q_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_v, 0
+// CHECK-NEXT: %vld3q_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_v, 1
+// CHECK-NEXT: %vld3q_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i16> %vld3q_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i16> %vld3q_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i16> %vld3q_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3q_u32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld3q_v = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld3q_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_v, 0
+// CHECK-NEXT: %vld3q_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_v, 1
+// CHECK-NEXT: %vld3q_v.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i32> %vld3q_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i32> %vld3q_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i32> %vld3q_v.fca.2.extract, <4 x i32>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3q_s8
+// CHECK: entry:
+// CHECK-NEXT: %vld3q_v = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld3q_v.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3q_v, 0
+// CHECK-NEXT: %vld3q_v.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3q_v, 1
+// CHECK-NEXT: %vld3q_v.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3q_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <16 x i8> %vld3q_v.fca.0.extract, <16 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <16 x i8> %vld3q_v.fca.1.extract, <16 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <16 x i8> %vld3q_v.fca.2.extract, <16 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3q_s16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld3q_v = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld3q_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_v, 0
+// CHECK-NEXT: %vld3q_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_v, 1
+// CHECK-NEXT: %vld3q_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i16> %vld3q_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i16> %vld3q_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i16> %vld3q_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3q_s32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld3q_v = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld3q_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_v, 0
+// CHECK-NEXT: %vld3q_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_v, 1
+// CHECK-NEXT: %vld3q_v.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i32> %vld3q_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i32> %vld3q_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i32> %vld3q_v.fca.2.extract, <4 x i32>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3q_f16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast half* %a to i8*
+// CHECK-NEXT: %vld3q_v = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3.v8f16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld3q_v.fca.0.extract = extractvalue { <8 x half>, <8 x half>, <8 x half> } %vld3q_v, 0
+// CHECK-NEXT: %vld3q_v.fca.1.extract = extractvalue { <8 x half>, <8 x half>, <8 x half> } %vld3q_v, 1
+// CHECK-NEXT: %vld3q_v.fca.2.extract = extractvalue { <8 x half>, <8 x half>, <8 x half> } %vld3q_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x half> %vld3q_v.fca.0.extract, <8 x half>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x half> %vld3q_v.fca.1.extract, <8 x half>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x half> %vld3q_v.fca.2.extract, <8 x half>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3q_f32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast float* %a to i8*
+// CHECK-NEXT: %vld3q_v = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld3q_v.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3q_v, 0
+// CHECK-NEXT: %vld3q_v.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3q_v, 1
+// CHECK-NEXT: %vld3q_v.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3q_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x float> %vld3q_v.fca.0.extract, <4 x float>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x float> %vld3q_v.fca.1.extract, <4 x float>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x float> %vld3q_v.fca.2.extract, <4 x float>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3q_p8
+// CHECK: entry:
+// CHECK-NEXT: %vld3q_v = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld3q_v.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3q_v, 0
+// CHECK-NEXT: %vld3q_v.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3q_v, 1
+// CHECK-NEXT: %vld3q_v.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3q_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <16 x i8> %vld3q_v.fca.0.extract, <16 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <16 x i8> %vld3q_v.fca.1.extract, <16 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <16 x i8> %vld3q_v.fca.2.extract, <16 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3q_p16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld3q_v = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld3q_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_v, 0
+// CHECK-NEXT: %vld3q_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_v, 1
+// CHECK-NEXT: %vld3q_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i16> %vld3q_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i16> %vld3q_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i16> %vld3q_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
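+//
+// The 64-bit vld3 expectations below assume drivers of the form (a sketch
+// based on the ACLE signatures), e.g.:
+//   uint8x8x3_t test_vld3_u8(uint8_t const *a) { return vld3_u8(a); }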
+// CHECK-LABEL: test_vld3_u8
+// CHECK: entry:
+// CHECK-NEXT: %vld3_v = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld3_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld3_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i8> %vld3_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_u16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld3_v = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld3_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld3_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i16> %vld3_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_u32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld3_v = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x i32> %vld3_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x i32> %vld3_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <2 x i32> %vld3_v.fca.2.extract, <2 x i32>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_u64
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i64* %a to i8*
+// CHECK-NEXT: %vld3_v = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* %0, i32 8)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <1 x i64> %vld3_v.fca.0.extract, <1 x i64>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <1 x i64> %vld3_v.fca.1.extract, <1 x i64>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <1 x i64> %vld3_v.fca.2.extract, <1 x i64>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_s8
+// CHECK: entry:
+// CHECK-NEXT: %vld3_v = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld3_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld3_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i8> %vld3_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_s16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld3_v = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld3_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld3_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i16> %vld3_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_s32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld3_v = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x i32> %vld3_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x i32> %vld3_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <2 x i32> %vld3_v.fca.2.extract, <2 x i32>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_s64
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i64* %a to i8*
+// CHECK-NEXT: %vld3_v = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* %0, i32 8)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <1 x i64> %vld3_v.fca.0.extract, <1 x i64>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <1 x i64> %vld3_v.fca.1.extract, <1 x i64>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <1 x i64> %vld3_v.fca.2.extract, <1 x i64>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_f16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast half* %a to i8*
+// CHECK-NEXT: %vld3_v = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3.v4f16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <4 x half>, <4 x half>, <4 x half> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <4 x half>, <4 x half>, <4 x half> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <4 x half>, <4 x half>, <4 x half> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x half> %vld3_v.fca.0.extract, <4 x half>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x half> %vld3_v.fca.1.extract, <4 x half>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x half> %vld3_v.fca.2.extract, <4 x half>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_f32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast float* %a to i8*
+// CHECK-NEXT: %vld3_v = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x float> %vld3_v.fca.0.extract, <2 x float>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x float> %vld3_v.fca.1.extract, <2 x float>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <2 x float> %vld3_v.fca.2.extract, <2 x float>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_p8
+// CHECK: entry:
+// CHECK-NEXT: %vld3_v = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld3_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld3_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i8> %vld3_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_p16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld3_v = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld3_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_v, 0
+// CHECK-NEXT: %vld3_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_v, 1
+// CHECK-NEXT: %vld3_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld3_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld3_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i16> %vld3_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: ret void
%b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, i32 7, i32 2) +// CHECK-NEXT: %vld3q_lane_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_lane_v, 0 +// CHECK-NEXT: %vld3q_lane_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_lane_v, 1 +// CHECK-NEXT: %vld3q_lane_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_lane_v, 2 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i16> %vld3q_lane_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i16> %vld3q_lane_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx6, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i16> %vld3q_lane_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx8, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld3q_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32> +// CHECK-NEXT: %6 = bitcast i32* %a to i8* +// CHECK-NEXT: %vld3q_lane_v = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* %6, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, i32 3, i32 4) +// CHECK-NEXT: %vld3q_lane_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_lane_v, 0 +// CHECK-NEXT: 
%vld3q_lane_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_lane_v, 1 +// CHECK-NEXT: %vld3q_lane_v.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_lane_v, 2 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i32> %vld3q_lane_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i32> %vld3q_lane_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx6, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x i32> %vld3q_lane_v.fca.2.extract, <4 x i32>* %__ret.sroa.5.0..sroa_idx8, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld3q_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %6 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld3q_lane_v = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %6, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, i32 7, i32 2) +// CHECK-NEXT: %vld3q_lane_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_lane_v, 0 +// CHECK-NEXT: %vld3q_lane_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_lane_v, 1 +// CHECK-NEXT: %vld3q_lane_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_lane_v, 2 +// CHECK-NEXT: 
%__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i16> %vld3q_lane_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i16> %vld3q_lane_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx6, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i16> %vld3q_lane_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx8, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld3q_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32> +// CHECK-NEXT: %6 = bitcast i32* %a to i8* +// CHECK-NEXT: %vld3q_lane_v = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* %6, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, i32 3, i32 4) +// CHECK-NEXT: %vld3q_lane_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_lane_v, 0 +// CHECK-NEXT: %vld3q_lane_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_lane_v, 1 +// CHECK-NEXT: %vld3q_lane_v.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3q_lane_v, 2 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i32> %vld3q_lane_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8 +// 
CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i32> %vld3q_lane_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx6, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x i32> %vld3q_lane_v.fca.2.extract, <4 x i32>* %__ret.sroa.5.0..sroa_idx8, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld3q_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x half> %0, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x half> %1, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x half> %b.sroa.0.0.vec.expand, <8 x half> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x half> %2, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x half> %3, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x half> %b.sroa.3.16.vec.expand, <8 x half> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x half> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x half> %4, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x half> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x half> %5, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x half> %b.sroa.6.32.vec.expand, <8 x half> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %6 = bitcast half* %a to i8* +// CHECK-NEXT: %vld3q_lane_v = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3lane.v8f16.p0i8(i8* %6, <8 x half> %b.sroa.0.8.vecblend, <8 x half> %b.sroa.3.24.vecblend, <8 x half> %b.sroa.6.40.vecblend, i32 7, i32 2) +// CHECK-NEXT: %vld3q_lane_v.fca.0.extract = extractvalue { <8 x half>, <8 x half>, <8 x half> } %vld3q_lane_v, 0 +// CHECK-NEXT: %vld3q_lane_v.fca.1.extract = extractvalue { <8 x half>, <8 x half>, <8 x half> } %vld3q_lane_v, 1 +// CHECK-NEXT: %vld3q_lane_v.fca.2.extract = extractvalue { <8 x half>, <8 x half>, <8 x half> } %vld3q_lane_v, 2 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x half> %vld3q_lane_v.fca.0.extract, <8 x half>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x half> 
%vld3q_lane_v.fca.1.extract, <8 x half>* %__ret.sroa.4.0..sroa_idx6, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x half> %vld3q_lane_v.fca.2.extract, <8 x half>* %__ret.sroa.5.0..sroa_idx8, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld3q_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x float> %b.sroa.0.0.vec.expand, <4 x float> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x float> %b.sroa.3.16.vec.expand, <4 x float> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x float> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x float> %4, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x float> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x float> %b.sroa.6.32.vec.expand, <4 x float> %b.sroa.6.40.vec.expand, <4 x i32> +// CHECK-NEXT: %6 = bitcast float* %a to i8* +// CHECK-NEXT: %vld3q_lane_v = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* %6, <4 x float> %b.sroa.0.8.vecblend, <4 x float> %b.sroa.3.24.vecblend, <4 x float> %b.sroa.6.40.vecblend, i32 3, i32 4) +// CHECK-NEXT: %vld3q_lane_v.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3q_lane_v, 0 +// CHECK-NEXT: %vld3q_lane_v.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3q_lane_v, 1 +// CHECK-NEXT: %vld3q_lane_v.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3q_lane_v, 2 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x float> %vld3q_lane_v.fca.0.extract, <4 x float>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x float> %vld3q_lane_v.fca.1.extract, <4 x float>* %__ret.sroa.4.0..sroa_idx6, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr 
inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x float> %vld3q_lane_v.fca.2.extract, <4 x float>* %__ret.sroa.5.0..sroa_idx8, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld3q_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %6 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld3q_lane_v = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %6, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, i32 7, i32 2) +// CHECK-NEXT: %vld3q_lane_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_lane_v, 0 +// CHECK-NEXT: %vld3q_lane_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_lane_v, 1 +// CHECK-NEXT: %vld3q_lane_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3q_lane_v, 2 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i16> %vld3q_lane_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i16> %vld3q_lane_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx6, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i16> %vld3q_lane_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx8, align 8 +// CHECK-NEXT: ret void +// 
CHECK-LABEL: test_vld3_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %vld3_lane_v = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1) +// CHECK-NEXT: %vld3_lane_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane_v, 0 +// CHECK-NEXT: %vld3_lane_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane_v, 1 +// CHECK-NEXT: %vld3_lane_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane_v, 2 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx5 = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i8> %vld3_lane_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx5, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i8> %vld3_lane_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx11 = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i8> %vld3_lane_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx11, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld3_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %3 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld3_lane_v = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %3, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, i32 3, i32 2) +// CHECK-NEXT: %vld3_lane_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane_v, 0 +// CHECK-NEXT: %vld3_lane_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane_v, 1 +// CHECK-NEXT: %vld3_lane_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane_v, 2 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i16> %vld3_lane_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i16> %vld3_lane_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx6, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x i16> %vld3_lane_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx8, align 8 
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_lane_u32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %3 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld3_lane_v = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* %3, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4)
+// CHECK-NEXT: %vld3_lane_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane_v, 0
+// CHECK-NEXT: %vld3_lane_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane_v, 1
+// CHECK-NEXT: %vld3_lane_v.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x i32> %vld3_lane_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x i32> %vld3_lane_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx6, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <2 x i32> %vld3_lane_v.fca.2.extract, <2 x i32>* %__ret.sroa.5.0..sroa_idx8, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_lane_s8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: %vld3_lane_v = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+// CHECK-NEXT: %vld3_lane_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane_v, 0
+// CHECK-NEXT: %vld3_lane_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane_v, 1
+// CHECK-NEXT: %vld3_lane_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx5 = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld3_lane_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx5, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld3_lane_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx8, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx11 = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i8> %vld3_lane_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx11, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_lane_s16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %3 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld3_lane_v = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %3, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, i32 3, i32 2)
+// CHECK-NEXT: %vld3_lane_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane_v, 0
+// CHECK-NEXT: %vld3_lane_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane_v, 1
+// CHECK-NEXT: %vld3_lane_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld3_lane_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld3_lane_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx6, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i16> %vld3_lane_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx8, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_lane_s32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %3 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld3_lane_v = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* %3, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4)
+// CHECK-NEXT: %vld3_lane_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane_v, 0
+// CHECK-NEXT: %vld3_lane_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane_v, 1
+// CHECK-NEXT: %vld3_lane_v.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x i32> %vld3_lane_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x i32> %vld3_lane_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx6, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <2 x i32> %vld3_lane_v.fca.2.extract, <2 x i32>* %__ret.sroa.5.0..sroa_idx8, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_lane_f16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half>
+// CHECK-NEXT: %3 = bitcast half* %a to i8*
+// CHECK-NEXT: %vld3_lane_v = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3lane.v4f16.p0i8(i8* %3, <4 x half> %0, <4 x half> %1, <4 x half> %2, i32 3, i32 2)
+// CHECK-NEXT: %vld3_lane_v.fca.0.extract = extractvalue { <4 x half>, <4 x half>, <4 x half> } %vld3_lane_v, 0
+// CHECK-NEXT: %vld3_lane_v.fca.1.extract = extractvalue { <4 x half>, <4 x half>, <4 x half> } %vld3_lane_v, 1
+// CHECK-NEXT: %vld3_lane_v.fca.2.extract = extractvalue { <4 x half>, <4 x half>, <4 x half> } %vld3_lane_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x half> %vld3_lane_v.fca.0.extract, <4 x half>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x half> %vld3_lane_v.fca.1.extract, <4 x half>* %__ret.sroa.4.0..sroa_idx6, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x half> %vld3_lane_v.fca.2.extract, <4 x half>* %__ret.sroa.5.0..sroa_idx8, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_lane_f32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float>
+// CHECK-NEXT: %3 = bitcast float* %a to i8*
+// CHECK-NEXT: %vld3_lane_v = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* %3, <2 x float> %0, <2 x float> %1, <2 x float> %2, i32 1, i32 4)
+// CHECK-NEXT: %vld3_lane_v.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane_v, 0
+// CHECK-NEXT: %vld3_lane_v.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane_v, 1
+// CHECK-NEXT: %vld3_lane_v.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x float> %vld3_lane_v.fca.0.extract, <2 x float>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x float> %vld3_lane_v.fca.1.extract, <2 x float>* %__ret.sroa.4.0..sroa_idx6, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <2 x float> %vld3_lane_v.fca.2.extract, <2 x float>* %__ret.sroa.5.0..sroa_idx8, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_lane_p8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: %vld3_lane_v = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+// CHECK-NEXT: %vld3_lane_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane_v, 0
+// CHECK-NEXT: %vld3_lane_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane_v, 1
+// CHECK-NEXT: %vld3_lane_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx5 = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld3_lane_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx5, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld3_lane_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx8, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx11 = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i8> %vld3_lane_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx11, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld3_lane_p16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %3 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld3_lane_v = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %3, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, i32 3, i32 2)
+// CHECK-NEXT: %vld3_lane_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane_v, 0
+// CHECK-NEXT: %vld3_lane_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane_v, 1
+// CHECK-NEXT: %vld3_lane_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane_v, 2
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld3_lane_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx6 = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld3_lane_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx6, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx8 = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i16> %vld3_lane_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx8, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4q_u8
+// CHECK: entry:
+// CHECK-NEXT: %vld4q_v = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld4q_v.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 0
+// CHECK-NEXT: %vld4q_v.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 1
+// CHECK-NEXT: %vld4q_v.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 2
+// CHECK-NEXT: %vld4q_v.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.0.extract, <16 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.1.extract, <16 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.2.extract, <16 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx10 = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.3.extract, <16 x i8>* %__ret.sroa.6.0..sroa_idx10, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4q_u16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld4q_v = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld4q_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 0
+// CHECK-NEXT: %vld4q_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 1
+// CHECK-NEXT: %vld4q_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 2
+// CHECK-NEXT: %vld4q_v.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.3.extract, <8 x i16>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4q_u32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld4q_v = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld4q_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_v, 0
+// CHECK-NEXT: %vld4q_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_v, 1
+// CHECK-NEXT: %vld4q_v.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_v, 2
+// CHECK-NEXT: %vld4q_v.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i32> %vld4q_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i32> %vld4q_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i32> %vld4q_v.fca.2.extract, <4 x i32>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <4 x i32> %vld4q_v.fca.3.extract, <4 x i32>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4q_s8
+// CHECK: entry:
+// CHECK-NEXT: %vld4q_v = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld4q_v.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 0
+// CHECK-NEXT: %vld4q_v.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 1
+// CHECK-NEXT: %vld4q_v.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 2
+// CHECK-NEXT: %vld4q_v.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.0.extract, <16 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.1.extract, <16 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.2.extract, <16 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx10 = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.3.extract, <16 x i8>* %__ret.sroa.6.0..sroa_idx10, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4q_s16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld4q_v = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld4q_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 0
+// CHECK-NEXT: %vld4q_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 1
+// CHECK-NEXT: %vld4q_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 2
+// CHECK-NEXT: %vld4q_v.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.3.extract, <8 x i16>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4q_s32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld4q_v = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld4q_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_v, 0
+// CHECK-NEXT: %vld4q_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_v, 1
+// CHECK-NEXT: %vld4q_v.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_v, 2
+// CHECK-NEXT: %vld4q_v.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i32> %vld4q_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i32> %vld4q_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i32> %vld4q_v.fca.2.extract, <4 x i32>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <4 x i32> %vld4q_v.fca.3.extract, <4 x i32>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4q_f16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast half* %a to i8*
+// CHECK-NEXT: %vld4q_v = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4.v8f16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld4q_v.fca.0.extract = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %vld4q_v, 0
+// CHECK-NEXT: %vld4q_v.fca.1.extract = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %vld4q_v, 1
+// CHECK-NEXT: %vld4q_v.fca.2.extract = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %vld4q_v, 2
+// CHECK-NEXT: %vld4q_v.fca.3.extract = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %vld4q_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x half> %vld4q_v.fca.0.extract, <8 x half>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x half> %vld4q_v.fca.1.extract, <8 x half>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x half> %vld4q_v.fca.2.extract, <8 x half>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <8 x half> %vld4q_v.fca.3.extract, <8 x half>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4q_f32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast float* %a to i8*
+// CHECK-NEXT: %vld4q_v = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld4q_v.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4q_v, 0
+// CHECK-NEXT: %vld4q_v.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4q_v, 1
+// CHECK-NEXT: %vld4q_v.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4q_v, 2
+// CHECK-NEXT: %vld4q_v.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4q_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x float> %vld4q_v.fca.0.extract, <4 x float>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x float> %vld4q_v.fca.1.extract, <4 x float>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x float> %vld4q_v.fca.2.extract, <4 x float>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <4 x float> %vld4q_v.fca.3.extract, <4 x float>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4q_p8
+// CHECK: entry:
+// CHECK-NEXT: %vld4q_v = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld4q_v.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 0
+// CHECK-NEXT: %vld4q_v.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 1
+// CHECK-NEXT: %vld4q_v.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 2
+// CHECK-NEXT: %vld4q_v.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4q_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.0.extract, <16 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.1.extract, <16 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.2.extract, <16 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx10 = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <16 x i8> %vld4q_v.fca.3.extract, <16 x i8>* %__ret.sroa.6.0..sroa_idx10, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4q_p16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld4q_v = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld4q_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 0
+// CHECK-NEXT: %vld4q_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 1
+// CHECK-NEXT: %vld4q_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 2
+// CHECK-NEXT: %vld4q_v.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <8 x i16> %vld4q_v.fca.3.extract, <8 x i16>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4_u8
+// CHECK: entry:
+// CHECK-NEXT: %vld4_v = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 0
+// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 1
+// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 2
+// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld4_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld4_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i8> %vld4_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx10 = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <8 x i8> %vld4_v.fca.3.extract, <8 x i8>* %__ret.sroa.6.0..sroa_idx10, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4_u16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld4_v = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 0
+// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 1
+// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 2
+// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld4_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld4_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i16> %vld4_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <4 x i16> %vld4_v.fca.3.extract, <4 x i16>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4_u32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld4_v = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_v, 0
+// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_v, 1
+// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_v, 2
+// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x i32> %vld4_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x i32> %vld4_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <2 x i32> %vld4_v.fca.2.extract, <2 x i32>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <2 x i32> %vld4_v.fca.3.extract, <2 x i32>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4_u64
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i64* %a to i8*
+// CHECK-NEXT: %vld4_v = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* %0, i32 8)
+// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_v, 0
+// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_v, 1
+// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_v, 2
+// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <1 x i64> %vld4_v.fca.0.extract, <1 x i64>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <1 x i64> %vld4_v.fca.1.extract, <1 x i64>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <1 x i64> %vld4_v.fca.2.extract, <1 x i64>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <1 x i64> %vld4_v.fca.3.extract, <1 x i64>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4_s8
+// CHECK: entry:
+// CHECK-NEXT: %vld4_v = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
+// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 0
+// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 1
+// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 2
+// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <8 x i8> %vld4_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <8 x i8> %vld4_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <8 x i8> %vld4_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx10 = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <8 x i8> %vld4_v.fca.3.extract, <8 x i8>* %__ret.sroa.6.0..sroa_idx10, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4_s16
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i16* %a to i8*
+// CHECK-NEXT: %vld4_v = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* %0, i32 2)
+// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 0
+// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 1
+// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 2
+// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <4 x i16> %vld4_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <4 x i16> %vld4_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* %agg.result, i32 0, i32 0, i32 2
+// CHECK-NEXT: store <4 x i16> %vld4_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8
+// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* %agg.result, i32 0, i32 0, i32 3
+// CHECK-NEXT: store <4 x i16> %vld4_v.fca.3.extract, <4 x i16>* %__ret.sroa.6.0..sroa_idx6, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vld4_s32
+// CHECK: entry:
+// CHECK-NEXT: %0 = bitcast i32* %a to i8*
+// CHECK-NEXT: %vld4_v = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* %0, i32 4)
+// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_v, 0
+// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_v, 1
+// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_v, 2
+// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_v, 3
+// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: store <2 x i32> %vld4_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8
+// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: store <2 x i32> %vld4_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx2, align 8
+// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.int32x2x4_t,
%struct.int32x2x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <2 x i32> %vld4_v.fca.2.extract, <2 x i32>* %__ret.sroa.5.0..sroa_idx4, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <2 x i32> %vld4_v.fca.3.extract, <2 x i32>* %__ret.sroa.6.0..sroa_idx6, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to i8* +// CHECK-NEXT: %vld4_v = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* %0, i32 8) +// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_v, 0 +// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_v, 1 +// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_v, 2 +// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <1 x i64> %vld4_v.fca.0.extract, <1 x i64>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <1 x i64> %vld4_v.fca.1.extract, <1 x i64>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <1 x i64> %vld4_v.fca.2.extract, <1 x i64>* %__ret.sroa.5.0..sroa_idx4, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <1 x i64> %vld4_v.fca.3.extract, <1 x i64>* %__ret.sroa.6.0..sroa_idx6, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast half* %a to i8* +// CHECK-NEXT: %vld4_v = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4.v4f16.p0i8(i8* %0, i32 2) +// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %vld4_v, 0 +// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %vld4_v, 1 +// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %vld4_v, 2 +// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %vld4_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x half> %vld4_v.fca.0.extract, <4 x half>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x half> %vld4_v.fca.1.extract, <4 x half>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x half> %vld4_v.fca.2.extract, <4 x half>* %__ret.sroa.5.0..sroa_idx4, align 8 +// CHECK-NEXT: 
%__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <4 x half> %vld4_v.fca.3.extract, <4 x half>* %__ret.sroa.6.0..sroa_idx6, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast float* %a to i8* +// CHECK-NEXT: %vld4_v = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32.p0i8(i8* %0, i32 4) +// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_v, 0 +// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_v, 1 +// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_v, 2 +// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x float> %vld4_v.fca.0.extract, <2 x float>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <2 x float> %vld4_v.fca.1.extract, <2 x float>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <2 x float> %vld4_v.fca.2.extract, <2 x float>* %__ret.sroa.5.0..sroa_idx4, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <2 x float> %vld4_v.fca.3.extract, <2 x float>* %__ret.sroa.6.0..sroa_idx6, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_p8 +// CHECK: entry: +// CHECK-NEXT: %vld4_v = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1) +// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 0 +// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 1 +// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 2 +// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx1 = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i8> %vld4_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx1, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx4 = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i8> %vld4_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx4, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx7 = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i8> %vld4_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx7, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx10 = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <8 x i8> %vld4_v.fca.3.extract, <8 x i8>* 
%__ret.sroa.6.0..sroa_idx10, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld4_v = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* %0, i32 2) +// CHECK-NEXT: %vld4_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 0 +// CHECK-NEXT: %vld4_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 1 +// CHECK-NEXT: %vld4_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 2 +// CHECK-NEXT: %vld4_v.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i16> %vld4_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i16> %vld4_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx2, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x i16> %vld4_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx4, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <4 x i16> %vld4_v.fca.3.extract, <4 x i16>* %__ret.sroa.6.0..sroa_idx6, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4q_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: 
%b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6 +// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x i16> %6, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7 +// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x i16> %7, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x i16> %b.sroa.9.48.vec.expand, <8 x i16> %b.sroa.9.56.vec.expand, <8 x i32> +// CHECK-NEXT: %8 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld4q_lane_v = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* %8, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, <8 x i16> %b.sroa.9.56.vecblend, i32 7, i32 2) +// CHECK-NEXT: %vld4q_lane_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 0 +// CHECK-NEXT: %vld4q_lane_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 1 +// CHECK-NEXT: %vld4q_lane_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 2 +// CHECK-NEXT: %vld4q_lane_v.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.3.extract, <8 x i16>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4q_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = 
extractvalue [8 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6 +// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <2 x i32> %6, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7 +// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <2 x i32> %7, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <4 x i32> %b.sroa.9.48.vec.expand, <4 x i32> %b.sroa.9.56.vec.expand, <4 x i32> +// CHECK-NEXT: %8 = bitcast i32* %a to i8* +// CHECK-NEXT: %vld4q_lane_v = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* %8, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, <4 x i32> %b.sroa.9.56.vecblend, i32 3, i32 4) +// CHECK-NEXT: %vld4q_lane_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_lane_v, 0 +// CHECK-NEXT: %vld4q_lane_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_lane_v, 1 +// CHECK-NEXT: %vld4q_lane_v.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_lane_v, 2 +// CHECK-NEXT: %vld4q_lane_v.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i32> %vld4q_lane_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i32> %vld4q_lane_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x i32> %vld4q_lane_v.fca.2.extract, <4 x i32>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <4 x i32> %vld4q_lane_v.fca.3.extract, <4 x i32>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4q_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x 
i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6 +// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x i16> %6, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7 +// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x i16> %7, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x i16> %b.sroa.9.48.vec.expand, <8 x i16> %b.sroa.9.56.vec.expand, <8 x i32> +// CHECK-NEXT: %8 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld4q_lane_v = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* %8, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, <8 x i16> %b.sroa.9.56.vecblend, i32 7, i32 2) +// CHECK-NEXT: %vld4q_lane_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 0 +// CHECK-NEXT: %vld4q_lane_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 1 +// CHECK-NEXT: %vld4q_lane_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 2 +// CHECK-NEXT: %vld4q_lane_v.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr 
inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.3.extract, <8 x i16>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4q_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6 +// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <2 x i32> %6, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7 +// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <2 x i32> %7, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <4 x i32> %b.sroa.9.48.vec.expand, <4 x i32> %b.sroa.9.56.vec.expand, <4 x i32> +// CHECK-NEXT: %8 = bitcast i32* %a to i8* +// CHECK-NEXT: %vld4q_lane_v = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* %8, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, <4 x i32> 
%b.sroa.9.56.vecblend, i32 3, i32 4) +// CHECK-NEXT: %vld4q_lane_v.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_lane_v, 0 +// CHECK-NEXT: %vld4q_lane_v.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_lane_v, 1 +// CHECK-NEXT: %vld4q_lane_v.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_lane_v, 2 +// CHECK-NEXT: %vld4q_lane_v.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4q_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i32> %vld4q_lane_v.fca.0.extract, <4 x i32>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i32> %vld4q_lane_v.fca.1.extract, <4 x i32>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x i32> %vld4q_lane_v.fca.2.extract, <4 x i32>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <4 x i32> %vld4q_lane_v.fca.3.extract, <4 x i32>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4q_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x half> %0, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x half> %1, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x half> %b.sroa.0.0.vec.expand, <8 x half> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x half> %2, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x half> %3, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x half> %b.sroa.3.16.vec.expand, <8 x half> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x half> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x half> %4, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x half> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x half> %5, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x half> %b.sroa.6.32.vec.expand, <8 x half> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.6.extract = 
extractvalue [8 x i64] %b.coerce, 6 +// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x half> +// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x half> %6, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7 +// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x half> +// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x half> %7, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x half> %b.sroa.9.48.vec.expand, <8 x half> %b.sroa.9.56.vec.expand, <8 x i32> +// CHECK-NEXT: %8 = bitcast half* %a to i8* +// CHECK-NEXT: %vld4q_lane_v = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4lane.v8f16.p0i8(i8* %8, <8 x half> %b.sroa.0.8.vecblend, <8 x half> %b.sroa.3.24.vecblend, <8 x half> %b.sroa.6.40.vecblend, <8 x half> %b.sroa.9.56.vecblend, i32 7, i32 2) +// CHECK-NEXT: %vld4q_lane_v.fca.0.extract = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %vld4q_lane_v, 0 +// CHECK-NEXT: %vld4q_lane_v.fca.1.extract = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %vld4q_lane_v, 1 +// CHECK-NEXT: %vld4q_lane_v.fca.2.extract = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %vld4q_lane_v, 2 +// CHECK-NEXT: %vld4q_lane_v.fca.3.extract = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %vld4q_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x half> %vld4q_lane_v.fca.0.extract, <8 x half>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x half> %vld4q_lane_v.fca.1.extract, <8 x half>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x half> %vld4q_lane_v.fca.2.extract, <8 x half>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <8 x half> %vld4q_lane_v.fca.3.extract, <8 x half>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4q_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x float> %b.sroa.0.0.vec.expand, <4 x float> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 
%b.coerce.fca.3.extract to <2 x float> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x float> %b.sroa.3.16.vec.expand, <4 x float> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x float> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x float> %4, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x float> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x float> %b.sroa.6.32.vec.expand, <4 x float> %b.sroa.6.40.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6 +// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <2 x float> +// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <2 x float> %6, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7 +// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <2 x float> +// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <2 x float> %7, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <4 x float> %b.sroa.9.48.vec.expand, <4 x float> %b.sroa.9.56.vec.expand, <4 x i32> +// CHECK-NEXT: %8 = bitcast float* %a to i8* +// CHECK-NEXT: %vld4q_lane_v = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* %8, <4 x float> %b.sroa.0.8.vecblend, <4 x float> %b.sroa.3.24.vecblend, <4 x float> %b.sroa.6.40.vecblend, <4 x float> %b.sroa.9.56.vecblend, i32 3, i32 4) +// CHECK-NEXT: %vld4q_lane_v.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4q_lane_v, 0 +// CHECK-NEXT: %vld4q_lane_v.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4q_lane_v, 1 +// CHECK-NEXT: %vld4q_lane_v.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4q_lane_v, 2 +// CHECK-NEXT: %vld4q_lane_v.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4q_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x float> %vld4q_lane_v.fca.0.extract, <4 x float>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x float> %vld4q_lane_v.fca.1.extract, <4 x float>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x float> %vld4q_lane_v.fca.2.extract, <4 x float>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <4 x float> %vld4q_lane_v.fca.3.extract, <4 x float>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4q_lane_p16 +// CHECK: entry: +// CHECK-NEXT: 
%b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6 +// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x i16> %6, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7 +// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x i16> %7, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x i16> %b.sroa.9.48.vec.expand, <8 x i16> %b.sroa.9.56.vec.expand, <8 x i32> +// CHECK-NEXT: %8 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld4q_lane_v = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* %8, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, <8 x i16> %b.sroa.9.56.vecblend, i32 7, i32 2) +// CHECK-NEXT: %vld4q_lane_v.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 0 +// CHECK-NEXT: %vld4q_lane_v.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 1 +// CHECK-NEXT: %vld4q_lane_v.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 2 +// CHECK-NEXT: %vld4q_lane_v.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4q_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.0.extract, <8 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: 
%__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.1.extract, <8 x i16>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.2.extract, <8 x i16>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <8 x i16> %vld4q_lane_v.fca.3.extract, <8 x i16>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: %vld4_lane_v = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 7, i32 1) +// CHECK-NEXT: %vld4_lane_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 0 +// CHECK-NEXT: %vld4_lane_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 1 +// CHECK-NEXT: %vld4_lane_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 2 +// CHECK-NEXT: %vld4_lane_v.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx7 = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx7, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx10 = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx13 = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx13, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx16 = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.3.extract, <8 x i8>* %__ret.sroa.6.0..sroa_idx16, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = 
bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld4_lane_v = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %4, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) +// CHECK-NEXT: %vld4_lane_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 0 +// CHECK-NEXT: %vld4_lane_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 1 +// CHECK-NEXT: %vld4_lane_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 2 +// CHECK-NEXT: %vld4_lane_v.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.3.extract, <4 x i16>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %4 = bitcast i32* %a to i8* +// CHECK-NEXT: %vld4_lane_v = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %4, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 1, i32 4) +// CHECK-NEXT: %vld4_lane_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane_v, 0 +// CHECK-NEXT: %vld4_lane_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane_v, 1 +// CHECK-NEXT: %vld4_lane_v.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane_v, 2 +// CHECK-NEXT: %vld4_lane_v.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x i32> %vld4_lane_v.fca.0.extract, <2 x i32>* 
%__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <2 x i32> %vld4_lane_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <2 x i32> %vld4_lane_v.fca.2.extract, <2 x i32>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <2 x i32> %vld4_lane_v.fca.3.extract, <2 x i32>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: %vld4_lane_v = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 7, i32 1) +// CHECK-NEXT: %vld4_lane_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 0 +// CHECK-NEXT: %vld4_lane_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 1 +// CHECK-NEXT: %vld4_lane_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 2 +// CHECK-NEXT: %vld4_lane_v.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx7 = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx7, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx10 = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx13 = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx13, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx16 = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.3.extract, <8 x i8>* %__ret.sroa.6.0..sroa_idx16, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 
%b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld4_lane_v = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %4, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) +// CHECK-NEXT: %vld4_lane_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 0 +// CHECK-NEXT: %vld4_lane_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 1 +// CHECK-NEXT: %vld4_lane_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 2 +// CHECK-NEXT: %vld4_lane_v.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.3.extract, <4 x i16>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %4 = bitcast i32* %a to i8* +// CHECK-NEXT: %vld4_lane_v = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %4, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 1, i32 4) +// CHECK-NEXT: %vld4_lane_v.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane_v, 0 +// CHECK-NEXT: %vld4_lane_v.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane_v, 1 +// CHECK-NEXT: %vld4_lane_v.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane_v, 2 +// CHECK-NEXT: %vld4_lane_v.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x i32> 
%vld4_lane_v.fca.0.extract, <2 x i32>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <2 x i32> %vld4_lane_v.fca.1.extract, <2 x i32>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <2 x i32> %vld4_lane_v.fca.2.extract, <2 x i32>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <2 x i32> %vld4_lane_v.fca.3.extract, <2 x i32>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half> +// CHECK-NEXT: %4 = bitcast half* %a to i8* +// CHECK-NEXT: %vld4_lane_v = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4lane.v4f16.p0i8(i8* %4, <4 x half> %0, <4 x half> %1, <4 x half> %2, <4 x half> %3, i32 3, i32 2) +// CHECK-NEXT: %vld4_lane_v.fca.0.extract = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %vld4_lane_v, 0 +// CHECK-NEXT: %vld4_lane_v.fca.1.extract = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %vld4_lane_v, 1 +// CHECK-NEXT: %vld4_lane_v.fca.2.extract = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %vld4_lane_v, 2 +// CHECK-NEXT: %vld4_lane_v.fca.3.extract = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %vld4_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x half> %vld4_lane_v.fca.0.extract, <4 x half>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x half> %vld4_lane_v.fca.1.extract, <4 x half>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x half> %vld4_lane_v.fca.2.extract, <4 x half>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <4 x half> %vld4_lane_v.fca.3.extract, <4 x half>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: 
%b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float> +// CHECK-NEXT: %4 = bitcast float* %a to i8* +// CHECK-NEXT: %vld4_lane_v = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* %4, <2 x float> %0, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4) +// CHECK-NEXT: %vld4_lane_v.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane_v, 0 +// CHECK-NEXT: %vld4_lane_v.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane_v, 1 +// CHECK-NEXT: %vld4_lane_v.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane_v, 2 +// CHECK-NEXT: %vld4_lane_v.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x float> %vld4_lane_v.fca.0.extract, <2 x float>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <2 x float> %vld4_lane_v.fca.1.extract, <2 x float>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <2 x float> %vld4_lane_v.fca.2.extract, <2 x float>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <2 x float> %vld4_lane_v.fca.3.extract, <2 x float>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: %vld4_lane_v = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 7, i32 1) +// CHECK-NEXT: %vld4_lane_v.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 0 +// CHECK-NEXT: %vld4_lane_v.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 1 +// CHECK-NEXT: %vld4_lane_v.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane_v, 2 +// CHECK-NEXT: %vld4_lane_v.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } 
%vld4_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx7 = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.0.extract, <8 x i8>* %__ret.sroa.0.0..sroa_idx7, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx10 = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.1.extract, <8 x i8>* %__ret.sroa.4.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx13 = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.2.extract, <8 x i8>* %__ret.sroa.5.0..sroa_idx13, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx16 = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <8 x i8> %vld4_lane_v.fca.3.extract, <8 x i8>* %__ret.sroa.6.0..sroa_idx16, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vld4_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: %vld4_lane_v = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %4, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) +// CHECK-NEXT: %vld4_lane_v.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 0 +// CHECK-NEXT: %vld4_lane_v.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 1 +// CHECK-NEXT: %vld4_lane_v.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 2 +// CHECK-NEXT: %vld4_lane_v.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane_v, 3 +// CHECK-NEXT: %__ret.sroa.0.0..sroa_idx = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.0.extract, <4 x i16>* %__ret.sroa.0.0..sroa_idx, align 8 +// CHECK-NEXT: %__ret.sroa.4.0..sroa_idx8 = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.1.extract, <4 x i16>* %__ret.sroa.4.0..sroa_idx8, align 8 +// CHECK-NEXT: %__ret.sroa.5.0..sroa_idx10 = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* %agg.result, i32 0, i32 0, i32 2 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.2.extract, <4 x i16>* %__ret.sroa.5.0..sroa_idx10, align 8 +// CHECK-NEXT: %__ret.sroa.6.0..sroa_idx12 = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* %agg.result, i32 0, i32 0, i32 3 +// CHECK-NEXT: store <4 x i16> %vld4_lane_v.fca.3.extract, <4 x i16>* %__ret.sroa.6.0..sroa_idx12, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vmax_s8 +// CHECK: entry: +// CHECK-NEXT: %vmax_v.i = tail call <8 x i8> 
@llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vmax_v.i +// CHECK-LABEL: test_vmax_s16 +// CHECK: entry: +// CHECK-NEXT: %vmax_v2.i = tail call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vmax_v2.i +// CHECK-LABEL: test_vmax_s32 +// CHECK: entry: +// CHECK-NEXT: %vmax_v2.i = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vmax_v2.i +// CHECK-LABEL: test_vmax_u8 +// CHECK: entry: +// CHECK-NEXT: %vmax_v.i = tail call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vmax_v.i +// CHECK-LABEL: test_vmax_u16 +// CHECK: entry: +// CHECK-NEXT: %vmax_v2.i = tail call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vmax_v2.i +// CHECK-LABEL: test_vmax_u32 +// CHECK: entry: +// CHECK-NEXT: %vmax_v2.i = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vmax_v2.i +// CHECK-LABEL: test_vmax_f32 +// CHECK: entry: +// CHECK-NEXT: %vmax_v2.i = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK-NEXT: ret <2 x float> %vmax_v2.i +// CHECK-LABEL: test_vmaxq_s8 +// CHECK: entry: +// CHECK-NEXT: %vmaxq_v.i = tail call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vmaxq_v.i +// CHECK-LABEL: test_vmaxq_s16 +// CHECK: entry: +// CHECK-NEXT: %vmaxq_v2.i = tail call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vmaxq_v2.i +// CHECK-LABEL: test_vmaxq_s32 +// CHECK: entry: +// CHECK-NEXT: %vmaxq_v2.i = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vmaxq_v2.i +// CHECK-LABEL: test_vmaxq_u8 +// CHECK: entry: +// CHECK-NEXT: %vmaxq_v.i = tail call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vmaxq_v.i +// CHECK-LABEL: test_vmaxq_u16 +// CHECK: entry: +// CHECK-NEXT: %vmaxq_v2.i = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vmaxq_v2.i +// CHECK-LABEL: test_vmaxq_u32 +// CHECK: entry: +// CHECK-NEXT: %vmaxq_v2.i = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vmaxq_v2.i +// CHECK-LABEL: test_vmaxq_f32 +// CHECK: entry: +// CHECK-NEXT: %vmaxq_v2.i = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %b) +// CHECK-NEXT: ret <4 x float> %vmaxq_v2.i +// CHECK-LABEL: test_vmin_s8 +// CHECK: entry: +// CHECK-NEXT: %vmin_v.i = tail call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vmin_v.i +// CHECK-LABEL: test_vmin_s16 +// CHECK: entry: +// CHECK-NEXT: %vmin_v2.i = tail call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vmin_v2.i +// CHECK-LABEL: test_vmin_s32 +// CHECK: entry: +// CHECK-NEXT: %vmin_v2.i = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vmin_v2.i +// CHECK-LABEL: test_vmin_u8 +// CHECK: entry: +// CHECK-NEXT: %vmin_v.i = tail call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vmin_v.i +// CHECK-LABEL: test_vmin_u16 +// CHECK: entry: +// CHECK-NEXT: %vmin_v2.i = tail call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %a, <4 x i16> %b) +// 
CHECK-NEXT: ret <4 x i16> %vmin_v2.i +// CHECK-LABEL: test_vmin_u32 +// CHECK: entry: +// CHECK-NEXT: %vmin_v2.i = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vmin_v2.i +// CHECK-LABEL: test_vmin_f32 +// CHECK: entry: +// CHECK-NEXT: %vmin_v2.i = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK-NEXT: ret <2 x float> %vmin_v2.i +// CHECK-LABEL: test_vminq_s8 +// CHECK: entry: +// CHECK-NEXT: %vminq_v.i = tail call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vminq_v.i +// CHECK-LABEL: test_vminq_s16 +// CHECK: entry: +// CHECK-NEXT: %vminq_v2.i = tail call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vminq_v2.i +// CHECK-LABEL: test_vminq_s32 +// CHECK: entry: +// CHECK-NEXT: %vminq_v2.i = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vminq_v2.i +// CHECK-LABEL: test_vminq_u8 +// CHECK: entry: +// CHECK-NEXT: %vminq_v.i = tail call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vminq_v.i +// CHECK-LABEL: test_vminq_u16 +// CHECK: entry: +// CHECK-NEXT: %vminq_v2.i = tail call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vminq_v2.i +// CHECK-LABEL: test_vminq_u32 +// CHECK: entry: +// CHECK-NEXT: %vminq_v2.i = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vminq_v2.i +// CHECK-LABEL: test_vminq_f32 +// CHECK: entry: +// CHECK-NEXT: %vminq_v2.i = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %a, <4 x float> %b) +// CHECK-NEXT: ret <4 x float> %vminq_v2.i +// CHECK-LABEL: test_vmla_s8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i8> %c, %b +// CHECK-NEXT: %add.i = add <8 x i8> %mul.i, %a +// CHECK-NEXT: ret <8 x i8> %add.i +// CHECK-LABEL: test_vmla_s16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i16> %c, %b +// CHECK-NEXT: %add.i = add <4 x i16> %mul.i, %a +// CHECK-NEXT: ret <4 x i16> %add.i +// CHECK-LABEL: test_vmla_s32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <2 x i32> %c, %b +// CHECK-NEXT: %add.i = add <2 x i32> %mul.i, %a +// CHECK-NEXT: ret <2 x i32> %add.i +// CHECK-LABEL: test_vmla_f32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = fmul <2 x float> %b, %c +// CHECK-NEXT: %add.i = fadd <2 x float> %mul.i, %a +// CHECK-NEXT: ret <2 x float> %add.i +// CHECK-LABEL: test_vmla_u8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i8> %c, %b +// CHECK-NEXT: %add.i = add <8 x i8> %mul.i, %a +// CHECK-NEXT: ret <8 x i8> %add.i +// CHECK-LABEL: test_vmla_u16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i16> %c, %b +// CHECK-NEXT: %add.i = add <4 x i16> %mul.i, %a +// CHECK-NEXT: ret <4 x i16> %add.i +// CHECK-LABEL: test_vmla_u32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <2 x i32> %c, %b +// CHECK-NEXT: %add.i = add <2 x i32> %mul.i, %a +// CHECK-NEXT: ret <2 x i32> %add.i +// CHECK-LABEL: test_vmlaq_s8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <16 x i8> %c, %b +// CHECK-NEXT: %add.i = add <16 x i8> %mul.i, %a +// CHECK-NEXT: ret <16 x i8> %add.i +// CHECK-LABEL: test_vmlaq_s16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i16> %c, %b +// CHECK-NEXT: %add.i = add <8 x i16> %mul.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vmlaq_s32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i32> %c, %b +// 
CHECK-NEXT: %add.i = add <4 x i32> %mul.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i
+// CHECK-LABEL: test_vmlaq_f32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = fmul <4 x float> %b, %c +// CHECK-NEXT: %add.i = fadd <4 x float> %mul.i, %a +// CHECK-NEXT: ret <4 x float> %add.i
+// CHECK-LABEL: test_vmlaq_u8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <16 x i8> %c, %b +// CHECK-NEXT: %add.i = add <16 x i8> %mul.i, %a +// CHECK-NEXT: ret <16 x i8> %add.i
+// CHECK-LABEL: test_vmlaq_u16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i16> %c, %b +// CHECK-NEXT: %add.i = add <8 x i16> %mul.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i
+// CHECK-LABEL: test_vmlaq_u32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i32> %c, %b +// CHECK-NEXT: %add.i = add <4 x i32> %mul.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i
+// CHECK-LABEL: test_vmlal_s8 +// CHECK: entry: +// CHECK-NEXT: %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: %add.i = add <8 x i16> %vmull.i.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i
+// CHECK-LABEL: test_vmlal_s16 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) +// CHECK-NEXT: %add.i = add <4 x i32> %vmull2.i.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i
+// CHECK-LABEL: test_vmlal_s32 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) +// CHECK-NEXT: %add.i = add <2 x i64> %vmull2.i.i, %a +// CHECK-NEXT: ret <2 x i64> %add.i
+// CHECK-LABEL: test_vmlal_u8 +// CHECK: entry: +// CHECK-NEXT: %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: %add.i = add <8 x i16> %vmull.i.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i
+// CHECK-LABEL: test_vmlal_u16 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) +// CHECK-NEXT: %add.i = add <4 x i32> %vmull2.i.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i
+// CHECK-LABEL: test_vmlal_u32 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) +// CHECK-NEXT: %add.i = add <2 x i64> %vmull2.i.i, %a +// CHECK-NEXT: ret <2 x i64> %add.i
+// CHECK-LABEL: test_vmlal_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) +// CHECK-NEXT: %add = add <4 x i32> %vmull2.i, %a +// CHECK-NEXT: ret <4 x i32> %add
+// CHECK-LABEL: test_vmlal_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) +// CHECK-NEXT: %add = add <2 x i64> %vmull2.i, %a +// CHECK-NEXT: ret <2 x i64> %add
+// CHECK-LABEL: test_vmlal_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) +// CHECK-NEXT: %add = add <4 x i32> %vmull2.i, %a +// CHECK-NEXT: ret <4 x i32> %add
+// CHECK-LABEL: test_vmlal_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) +// CHECK-NEXT: %add = add <2 x i64> %vmull2.i, %a +// CHECK-NEXT: ret <2 x i64> %add
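+// The _n tests that follow broadcast a scalar into a vector before the
+// widening multiply-accumulate. A minimal sketch of the presumed C source,
+// assuming the usual arm_neon_intrinsics.c signatures (illustrative only,
+// not taken from this patch):
+//   int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
+//     return vmlal_n_s16(a, b, c); // a + (widen(b) * splat(c))
+//   }
+// FileCheck ignores comment lines that carry no check prefix, so notes like
+// these do not affect the test.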
+// CHECK-LABEL: test_vmlal_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %vecinit3.i) +// CHECK-NEXT: %add.i = add <4 x i32> %vmull2.i.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i
+// CHECK-LABEL: test_vmlal_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %vecinit1.i) +// CHECK-NEXT: %add.i = add <2 x i64> %vmull2.i.i, %a +// CHECK-NEXT: ret <2 x i64> %add.i
+// CHECK-LABEL: test_vmlal_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %vecinit3.i) +// CHECK-NEXT: %add.i = add <4 x i32> %vmull2.i.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i
+// CHECK-LABEL: test_vmlal_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %vecinit1.i) +// CHECK-NEXT: %add.i = add <2 x i64> %vmull2.i.i, %a +// CHECK-NEXT: ret <2 x i64> %add.i
+// CHECK-LABEL: test_vmla_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %mul = mul <4 x i16> %shuffle, %b +// CHECK-NEXT: %add = add <4 x i16> %mul, %a +// CHECK-NEXT: ret <4 x i16> %add
+// CHECK-LABEL: test_vmla_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %mul = mul <2 x i32> %shuffle, %b +// CHECK-NEXT: %add = add <2 x i32> %mul, %a +// CHECK-NEXT: ret <2 x i32> %add
+// CHECK-LABEL: test_vmla_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %mul = mul <4 x i16> %shuffle, %b +// CHECK-NEXT: %add = add <4 x i16> %mul, %a +// CHECK-NEXT: ret <4 x i16> %add
+// CHECK-LABEL: test_vmla_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %mul = mul <2 x i32> %shuffle, %b +// CHECK-NEXT: %add = add <2 x i32> %mul, %a +// CHECK-NEXT: ret <2 x i32> %add
+// CHECK-LABEL: test_vmla_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x float> %c, <2 x float> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %mul = fmul <2 x float> %shuffle, %b +// CHECK-NEXT: %add = fadd <2 x float> %mul, %a +// CHECK-NEXT: ret <2 x float> %add
+// CHECK-LABEL: test_vmlaq_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %mul = mul <8 x i16> %shuffle, %b +// CHECK-NEXT: %add = add <8 x i16> %mul, %a +// CHECK-NEXT: ret <8 x i16> %add
+// CHECK-LABEL: test_vmlaq_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: %mul = mul <4 x i32> %shuffle, %b +// CHECK-NEXT: %add = add <4 x i32> %mul, %a +// CHECK-NEXT: ret <4 x i32> %add
+// CHECK-LABEL: test_vmlaq_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %mul = mul <8 x i16> %shuffle, %b +// CHECK-NEXT: %add = add <8 x i16> %mul, %a +// CHECK-NEXT: ret <8 x i16> %add
+// CHECK-LABEL: test_vmlaq_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: %mul = mul <4 x i32> %shuffle, %b +// CHECK-NEXT: %add = add <4 x i32> %mul, %a +// CHECK-NEXT: ret <4 x i32> %add
+// CHECK-LABEL: test_vmlaq_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x float> %c, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: %mul = fmul <4 x float> %shuffle, %b +// CHECK-NEXT: %add = fadd <4 x float> %mul, %a +// CHECK-NEXT: ret <4 x float> %add
+// CHECK-LABEL: test_vmla_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i16> %vecinit3.i, %b +// CHECK-NEXT: %add.i = add <4 x i16> %mul.i, %a +// CHECK-NEXT: ret <4 x i16> %add.i
+// CHECK-LABEL: test_vmla_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <2 x i32> %vecinit1.i, %b +// CHECK-NEXT: %add.i = add <2 x i32> %mul.i, %a +// CHECK-NEXT: ret <2 x i32> %add.i
+// CHECK-LABEL: test_vmla_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i16> %vecinit3.i, %b +// CHECK-NEXT: %add.i = add <4 x i16> %mul.i, %a +// CHECK-NEXT: ret <4 x i16> %add.i
+// CHECK-LABEL: test_vmla_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <2 x i32> %vecinit1.i, %b +// CHECK-NEXT: %add.i = add <2 x i32> %mul.i, %a +// CHECK-NEXT: ret <2 x i32> %add.i
+// CHECK-LABEL: test_vmla_n_f32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x float> undef, float %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x float> %vecinit.i, <2 x float> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = fmul <2 x float> %vecinit1.i, %b +// CHECK-NEXT: %add.i = fadd <2 x float> %mul.i, %a +// CHECK-NEXT: ret <2 x float> %add.i
+// CHECK-LABEL: test_vmlaq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <8 x i16> %vecinit7.i, %b +// CHECK-NEXT: %add.i = add <8 x i16> %mul.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i
+// CHECK-LABEL: test_vmlaq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i32>
%vecinit3.i, %b +// CHECK-NEXT: %add.i = add <4 x i32> %mul.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vmlaq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <8 x i16> %vecinit7.i, %b +// CHECK-NEXT: %add.i = add <8 x i16> %mul.i, %a +// CHECK-NEXT: ret <8 x i16> %add.i +// CHECK-LABEL: test_vmlaq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i32> %vecinit3.i, %b +// CHECK-NEXT: %add.i = add <4 x i32> %mul.i, %a +// CHECK-NEXT: ret <4 x i32> %add.i +// CHECK-LABEL: test_vmlaq_n_f32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x float> undef, float %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = fmul <4 x float> %vecinit3.i, %b +// CHECK-NEXT: %add.i = fadd <4 x float> %mul.i, %a +// CHECK-NEXT: ret <4 x float> %add.i +// CHECK-LABEL: test_vmls_s8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i8> %c, %b +// CHECK-NEXT: %sub.i = sub <8 x i8> %a, %mul.i +// CHECK-NEXT: ret <8 x i8> %sub.i +// CHECK-LABEL: test_vmls_s16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i16> %c, %b +// CHECK-NEXT: %sub.i = sub <4 x i16> %a, %mul.i +// CHECK-NEXT: ret <4 x i16> %sub.i +// CHECK-LABEL: test_vmls_s32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <2 x i32> %c, %b +// CHECK-NEXT: %sub.i = sub <2 x i32> %a, %mul.i +// CHECK-NEXT: ret <2 x i32> %sub.i +// CHECK-LABEL: test_vmls_f32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = fmul <2 x float> %b, %c +// CHECK-NEXT: %sub.i = fsub <2 x float> %a, %mul.i +// CHECK-NEXT: ret <2 x float> %sub.i +// CHECK-LABEL: test_vmls_u8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i8> %c, %b +// CHECK-NEXT: %sub.i = sub <8 x i8> %a, %mul.i +// CHECK-NEXT: ret <8 x i8> %sub.i +// CHECK-LABEL: test_vmls_u16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i16> %c, %b +// CHECK-NEXT: %sub.i = sub <4 x i16> %a, %mul.i +// CHECK-NEXT: ret <4 x i16> %sub.i +// CHECK-LABEL: test_vmls_u32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <2 x i32> %c, %b +// CHECK-NEXT: %sub.i = sub <2 x i32> %a, %mul.i +// CHECK-NEXT: ret <2 x i32> %sub.i +// CHECK-LABEL: test_vmlsq_s8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <16 x i8> %c, %b +// CHECK-NEXT: %sub.i = sub <16 x i8> %a, %mul.i +// CHECK-NEXT: ret <16 x i8> %sub.i +// CHECK-LABEL: test_vmlsq_s16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i16> %c, %b +// CHECK-NEXT: %sub.i = sub <8 x i16> %a, %mul.i +// CHECK-NEXT: ret <8 x i16> %sub.i +// CHECK-LABEL: test_vmlsq_s32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i32> %c, %b +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %mul.i +// CHECK-NEXT: ret <4 x i32> %sub.i +// CHECK-LABEL: test_vmlsq_f32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = fmul <4 x float> %b, %c +// CHECK-NEXT: %sub.i = fsub <4 x float> %a, %mul.i +// CHECK-NEXT: ret <4 x float> %sub.i +// CHECK-LABEL: test_vmlsq_u8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <16 x i8> %c, %b +// CHECK-NEXT: %sub.i = sub <16 x i8> %a, %mul.i +// CHECK-NEXT: ret <16 x i8> %sub.i +// CHECK-LABEL: test_vmlsq_u16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i16> %c, %b +// 
CHECK-NEXT: %sub.i = sub <8 x i16> %a, %mul.i +// CHECK-NEXT: ret <8 x i16> %sub.i
+// CHECK-LABEL: test_vmlsq_u32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i32> %c, %b +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %mul.i +// CHECK-NEXT: ret <4 x i32> %sub.i
+// CHECK-LABEL: test_vmlsl_s8 +// CHECK: entry: +// CHECK-NEXT: %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: %sub.i = sub <8 x i16> %a, %vmull.i.i +// CHECK-NEXT: ret <8 x i16> %sub.i
+// CHECK-LABEL: test_vmlsl_s16 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %vmull2.i.i +// CHECK-NEXT: ret <4 x i32> %sub.i
+// CHECK-LABEL: test_vmlsl_s32 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) +// CHECK-NEXT: %sub.i = sub <2 x i64> %a, %vmull2.i.i +// CHECK-NEXT: ret <2 x i64> %sub.i
+// CHECK-LABEL: test_vmlsl_u8 +// CHECK: entry: +// CHECK-NEXT: %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: %sub.i = sub <8 x i16> %a, %vmull.i.i +// CHECK-NEXT: ret <8 x i16> %sub.i
+// CHECK-LABEL: test_vmlsl_u16 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %vmull2.i.i +// CHECK-NEXT: ret <4 x i32> %sub.i
+// CHECK-LABEL: test_vmlsl_u32 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) +// CHECK-NEXT: %sub.i = sub <2 x i64> %a, %vmull2.i.i +// CHECK-NEXT: ret <2 x i64> %sub.i
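+// The _lane variants below splat one lane of the last vector operand before
+// the widening multiply; that splat is what the shufflevector masks in the
+// checks encode. A minimal sketch of the presumed source, assuming lane 3 is
+// exercised (matching the lane-3/lane-1 convention of the vld4_lane tests
+// above; illustrative only):
+//   int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
+//     return vmlsl_lane_s16(a, b, c, 3); // a - (widen(b) * splat(c[3]))
+//   }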
+// CHECK-LABEL: test_vmlsl_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) +// CHECK-NEXT: %sub = sub <4 x i32> %a, %vmull2.i +// CHECK-NEXT: ret <4 x i32> %sub
+// CHECK-LABEL: test_vmlsl_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) +// CHECK-NEXT: %sub = sub <2 x i64> %a, %vmull2.i +// CHECK-NEXT: ret <2 x i64> %sub
+// CHECK-LABEL: test_vmlsl_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) +// CHECK-NEXT: %sub = sub <4 x i32> %a, %vmull2.i +// CHECK-NEXT: ret <4 x i32> %sub
+// CHECK-LABEL: test_vmlsl_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) +// CHECK-NEXT: %sub = sub <2 x i64> %a, %vmull2.i +// CHECK-NEXT: ret <2 x i64> %sub
+// CHECK-LABEL: test_vmlsl_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %vecinit3.i) +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %vmull2.i.i +// CHECK-NEXT: ret <4 x i32> %sub.i
+// CHECK-LABEL: test_vmlsl_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %vecinit1.i) +// CHECK-NEXT: %sub.i = sub <2 x i64> %a, %vmull2.i.i +// CHECK-NEXT: ret <2 x i64> %sub.i
+// CHECK-LABEL: test_vmlsl_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %vecinit3.i) +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %vmull2.i.i +// CHECK-NEXT: ret <4 x i32> %sub.i
+// CHECK-LABEL: test_vmlsl_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %vecinit1.i) +// CHECK-NEXT: %sub.i = sub <2 x i64> %a, %vmull2.i.i +// CHECK-NEXT: ret <2 x i64> %sub.i
+// CHECK-LABEL: test_vmls_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %mul = mul <4 x i16> %shuffle, %b +// CHECK-NEXT: %sub = sub <4 x i16> %a, %mul +// CHECK-NEXT: ret <4 x i16> %sub
+// CHECK-LABEL: test_vmls_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %mul = mul <2 x i32> %shuffle, %b +// CHECK-NEXT: %sub = sub <2 x i32> %a, %mul +// CHECK-NEXT: ret <2 x i32> %sub
+// CHECK-LABEL: test_vmls_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %mul = mul <4 x i16> %shuffle, %b +// CHECK-NEXT: %sub = sub <4 x i16> %a, %mul +// CHECK-NEXT: ret <4 x i16> %sub
+// CHECK-LABEL: test_vmls_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %mul = mul <2 x i32> %shuffle, %b +// CHECK-NEXT: %sub = sub <2 x i32> %a, %mul +// CHECK-NEXT: ret <2 x i32> %sub
+// CHECK-LABEL: test_vmls_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x float> %c, <2 x float> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %mul = fmul <2 x float> %shuffle, %b +// CHECK-NEXT: %sub = fsub <2 x float> %a, %mul +// CHECK-NEXT: ret <2 x float> %sub
+// CHECK-LABEL: test_vmlsq_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %mul = mul <8 x i16> %shuffle, %b +// CHECK-NEXT: %sub = sub <8 x i16> %a, %mul +// CHECK-NEXT: ret <8 x i16> %sub
+// CHECK-LABEL: test_vmlsq_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: %mul = mul <4 x i32> %shuffle, %b +// CHECK-NEXT: %sub = sub <4 x i32> %a, %mul +// CHECK-NEXT: ret <4 x i32> %sub
+// CHECK-LABEL: test_vmlsq_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %mul = mul <8 x i16> %shuffle, %b +// CHECK-NEXT: %sub = sub <8 x i16> %a, %mul +// CHECK-NEXT: ret <8 x i16> %sub
+// CHECK-LABEL: test_vmlsq_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: %mul = mul <4 x i32> %shuffle, %b +// CHECK-NEXT: %sub = sub <4 x i32> %a, %mul +// CHECK-NEXT: ret <4 x i32> %sub
+// CHECK-LABEL: test_vmlsq_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x float> %c, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: %mul = fmul <4 x float> %shuffle, %b +// CHECK-NEXT: %sub = fsub <4 x float> %a, %mul +// CHECK-NEXT: ret <4 x float> %sub
+// CHECK-LABEL: test_vmls_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i16> %vecinit3.i, %b +// CHECK-NEXT: %sub.i = sub <4 x i16> %a, %mul.i +// CHECK-NEXT: ret <4 x i16> %sub.i
+// CHECK-LABEL: test_vmls_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <2 x i32> %vecinit1.i, %b +// CHECK-NEXT: %sub.i = sub <2 x i32> %a, %mul.i +// CHECK-NEXT: ret <2 x i32> %sub.i
+// CHECK-LABEL: test_vmls_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i16> %vecinit3.i, %b +// CHECK-NEXT: %sub.i = sub <4 x i16> %a, %mul.i +// CHECK-NEXT: ret <4 x i16> %sub.i
+// CHECK-LABEL: test_vmls_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <2 x i32> %vecinit1.i, %b +// CHECK-NEXT: %sub.i = sub <2 x i32> %a, %mul.i +// CHECK-NEXT: ret <2 x i32> %sub.i
+// CHECK-LABEL: test_vmls_n_f32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x float> undef, float %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x float> %vecinit.i, <2 x float> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = fmul <2 x float> %vecinit1.i, %b +// CHECK-NEXT: %sub.i = fsub <2 x float> %a, %mul.i +// CHECK-NEXT: ret <2 x float> %sub.i
+// CHECK-LABEL: test_vmlsq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <8 x i16> %vecinit7.i, %b +// CHECK-NEXT: %sub.i = sub <8 x i16> %a, %mul.i +// CHECK-NEXT: ret <8 x i16> %sub.i
+// CHECK-LABEL: test_vmlsq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i32> %vecinit3.i, %b +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %mul.i +// CHECK-NEXT: ret <4 x i32> %sub.i
+// CHECK-LABEL: test_vmlsq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <8 x i16> %vecinit7.i, %b +// CHECK-NEXT: %sub.i = sub <8 x i16> %a, %mul.i +// CHECK-NEXT: ret <8 x i16> %sub.i
+// CHECK-LABEL: test_vmlsq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i =
insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i32> %vecinit3.i, %b +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %mul.i +// CHECK-NEXT: ret <4 x i32> %sub.i +// CHECK-LABEL: test_vmlsq_n_f32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x float> undef, float %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = fmul <4 x float> %vecinit3.i, %b +// CHECK-NEXT: %sub.i = fsub <4 x float> %a, %mul.i +// CHECK-NEXT: ret <4 x float> %sub.i +// CHECK-LABEL: test_vmovl_s8 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i = sext <8 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %vmovl.i +// CHECK-LABEL: test_vmovl_s16 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i = sext <4 x i16> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %vmovl.i +// CHECK-LABEL: test_vmovl_s32 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i = sext <2 x i32> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %vmovl.i +// CHECK-LABEL: test_vmovl_u8 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i = zext <8 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %vmovl.i +// CHECK-LABEL: test_vmovl_u16 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i = zext <4 x i16> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %vmovl.i +// CHECK-LABEL: test_vmovl_u32 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i = zext <2 x i32> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %vmovl.i +// CHECK-LABEL: test_vmovn_s16 +// CHECK: entry: +// CHECK-NEXT: %vmovn.i = trunc <8 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %vmovn.i +// CHECK-LABEL: test_vmovn_s32 +// CHECK: entry: +// CHECK-NEXT: %vmovn.i = trunc <4 x i32> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vmovn.i +// CHECK-LABEL: test_vmovn_s64 +// CHECK: entry: +// CHECK-NEXT: %vmovn.i = trunc <2 x i64> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vmovn.i +// CHECK-LABEL: test_vmovn_u16 +// CHECK: entry: +// CHECK-NEXT: %vmovn.i = trunc <8 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %vmovn.i +// CHECK-LABEL: test_vmovn_u32 +// CHECK: entry: +// CHECK-NEXT: %vmovn.i = trunc <4 x i32> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vmovn.i +// CHECK-LABEL: test_vmovn_u64 +// CHECK: entry: +// CHECK-NEXT: %vmovn.i = trunc <2 x i64> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vmovn.i +// CHECK-LABEL: test_vmov_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> %vecinit7.i +// CHECK-LABEL: test_vmov_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> %vecinit3.i +// CHECK-LABEL: test_vmov_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %a, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> %vecinit1.i +// CHECK-LABEL: test_vmov_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> %vecinit7.i +// 
CHECK-LABEL: test_vmov_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> %vecinit3.i +// CHECK-LABEL: test_vmov_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %a, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> %vecinit1.i +// CHECK-LABEL: test_vmov_n_p8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> %vecinit7.i +// CHECK-LABEL: test_vmov_n_p16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> %vecinit3.i +// CHECK-LABEL: test_vmov_n_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = load half, half* %a, align 2, !tbaa !3 +// CHECK-NEXT: %vecinit = insertelement <4 x half> undef, half %0, i32 0 +// CHECK-NEXT: %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x half> %vecinit3 +// CHECK-LABEL: test_vmov_n_f32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x float> undef, float %a, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x float> %vecinit.i, <2 x float> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x float> %vecinit1.i +// CHECK-LABEL: test_vmovq_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit15.i = shufflevector <16 x i8> %vecinit.i, <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> %vecinit15.i +// CHECK-LABEL: test_vmovq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> %vecinit7.i +// CHECK-LABEL: test_vmovq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> %vecinit3.i +// CHECK-LABEL: test_vmovq_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit15.i = shufflevector <16 x i8> %vecinit.i, <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> %vecinit15.i +// CHECK-LABEL: test_vmovq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> %vecinit7.i +// CHECK-LABEL: test_vmovq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> %vecinit3.i +// CHECK-LABEL: test_vmovq_n_p8 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK-NEXT: %vecinit15.i = shufflevector <16 x 
i8> %vecinit.i, <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> %vecinit15.i
+// CHECK-LABEL: test_vmovq_n_p16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> %vecinit7.i
+// CHECK-LABEL: test_vmovq_n_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = load half, half* %a, align 2, !tbaa !3 +// CHECK-NEXT: %vecinit = insertelement <8 x half> undef, half %0, i32 0 +// CHECK-NEXT: %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x half> %vecinit7
+// CHECK-LABEL: test_vmovq_n_f32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x float> undef, float %a, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x float> %vecinit3.i
+// CHECK-LABEL: test_vmov_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: %add.i = shl <1 x i64> %vecinit.i, <i64 1> +// CHECK-NEXT: ret <1 x i64> %add.i
+// CHECK-LABEL: test_vmov_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: %add.i = shl <1 x i64> %vecinit.i, <i64 1> +// CHECK-NEXT: ret <1 x i64> %add.i
+// CHECK-LABEL: test_vmovq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> %vecinit1.i
+// CHECK-LABEL: test_vmovq_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> %vecinit1.i
+// CHECK-LABEL: test_vmul_s8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i8> %b, %a +// CHECK-NEXT: ret <8 x i8> %mul.i
+// CHECK-LABEL: test_vmul_s16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i16> %b, %a +// CHECK-NEXT: ret <4 x i16> %mul.i
+// CHECK-LABEL: test_vmul_s32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <2 x i32> %b, %a +// CHECK-NEXT: ret <2 x i32> %mul.i
+// CHECK-LABEL: test_vmul_f32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = fmul <2 x float> %a, %b +// CHECK-NEXT: ret <2 x float> %mul.i
+// CHECK-LABEL: test_vmul_u8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i8> %b, %a +// CHECK-NEXT: ret <8 x i8> %mul.i
+// CHECK-LABEL: test_vmul_u16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i16> %b, %a +// CHECK-NEXT: ret <4 x i16> %mul.i
+// CHECK-LABEL: test_vmul_u32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <2 x i32> %b, %a +// CHECK-NEXT: ret <2 x i32> %mul.i
+// CHECK-LABEL: test_vmulq_s8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <16 x i8> %b, %a +// CHECK-NEXT: ret <16 x i8> %mul.i
+// CHECK-LABEL: test_vmulq_s16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i16> %b, %a +// CHECK-NEXT: ret <8 x i16> %mul.i
+// CHECK-LABEL: test_vmulq_s32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i32> %b, %a +// CHECK-NEXT: ret <4 x i32> %mul.i
+// CHECK-LABEL: test_vmulq_f32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = fmul <4 x float> %a, %b +// CHECK-NEXT: ret <4 x float> %mul.i
+// CHECK-LABEL: test_vmulq_u8 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <16 x i8> %b, %a +// CHECK-NEXT: ret <16 x i8> %mul.i
+// CHECK-LABEL: test_vmulq_u16 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <8 x i16> %b, %a +// CHECK-NEXT: ret <8 x i16> %mul.i
+// CHECK-LABEL: test_vmulq_u32 +// CHECK: entry: +// CHECK-NEXT: %mul.i = mul <4 x i32> %b, %a +// CHECK-NEXT: ret <4 x i32> %mul.i
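+// vmull_* below are the plain widening multiplies that the accumulate forms
+// above build on. A minimal sketch of the presumed source (illustrative only,
+// assuming the usual test signature):
+//   int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
+//     return vmull_s8(a, b); // <8 x i8> * <8 x i8> -> <8 x i16>
+//   }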
+// CHECK-LABEL: test_vmull_s8 +// CHECK: entry: +// CHECK-NEXT: %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i16> %vmull.i
+// CHECK-LABEL: test_vmull_s16 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i32> %vmull2.i
+// CHECK-LABEL: test_vmull_s32 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i64> %vmull2.i
+// CHECK-LABEL: test_vmull_u8 +// CHECK: entry: +// CHECK-NEXT: %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i16> %vmull.i
+// CHECK-LABEL: test_vmull_u16 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i32> %vmull2.i
+// CHECK-LABEL: test_vmull_u32 +// CHECK: entry: +// CHECK-NEXT: %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i64> %vmull2.i
+// CHECK-LABEL: test_vmull_p8 +// CHECK: entry: +// CHECK-NEXT: %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i16> %vmull.i
+// CHECK-LABEL: test_vmull_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) +// CHECK-NEXT: ret <4 x i32> %vmull2.i
+// CHECK-LABEL: test_vmull_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) +// CHECK-NEXT: ret <2 x i64> %vmull2.i
+// CHECK-LABEL: test_vmull_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) +// CHECK-NEXT: ret <4 x i32> %vmull2.i
+// CHECK-LABEL: test_vmull_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) +// CHECK-NEXT: ret <2 x i64> %vmull2.i
+// CHECK-LABEL: test_vmull_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vmull5.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %vecinit3.i) +// CHECK-NEXT: ret <4 x i32> %vmull5.i
+// CHECK-LABEL: test_vmull_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vmull3.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %vecinit1.i) +// CHECK-NEXT: ret <2 x i64> %vmull3.i
CHECK-LABEL: test_vmull_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vmull5.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %vecinit3.i) +// CHECK-NEXT: ret <4 x i32> %vmull5.i +// CHECK-LABEL: test_vmull_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vmull3.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %vecinit1.i) +// CHECK-NEXT: ret <2 x i64> %vmull3.i +// CHECK-LABEL: test_vmul_p8 +// CHECK: entry: +// CHECK-NEXT: %vmul_v.i = tail call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vmul_v.i +// CHECK-LABEL: test_vmulq_p8 +// CHECK: entry: +// CHECK-NEXT: %vmulq_v.i = tail call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vmulq_v.i +// CHECK-LABEL: test_vmul_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: %mul = mul <4 x i16> %shuffle, %a +// CHECK-NEXT: ret <4 x i16> %mul +// CHECK-LABEL: test_vmul_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> +// CHECK-NEXT: %mul = mul <2 x i32> %shuffle, %a +// CHECK-NEXT: ret <2 x i32> %mul +// CHECK-LABEL: test_vmul_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> +// CHECK-NEXT: %mul = fmul <2 x float> %shuffle, %a +// CHECK-NEXT: ret <2 x float> %mul +// CHECK-LABEL: test_vmul_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: %mul = mul <4 x i16> %shuffle, %a +// CHECK-NEXT: ret <4 x i16> %mul +// CHECK-LABEL: test_vmul_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> +// CHECK-NEXT: %mul = mul <2 x i32> %shuffle, %a +// CHECK-NEXT: ret <2 x i32> %mul +// CHECK-LABEL: test_vmulq_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %mul = mul <8 x i16> %shuffle, %a +// CHECK-NEXT: ret <8 x i16> %mul +// CHECK-LABEL: test_vmulq_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %mul = mul <4 x i32> %shuffle, %a +// CHECK-NEXT: ret <4 x i32> %mul +// CHECK-LABEL: test_vmulq_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x float> %b, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %mul = fmul <4 x float> %shuffle, %a +// CHECK-NEXT: ret <4 x float> %mul +// CHECK-LABEL: test_vmulq_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %mul = mul <8 x i16> %shuffle, %a +// CHECK-NEXT: ret <8 x i16> %mul +// CHECK-LABEL: test_vmulq_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %mul = mul <4 x i32> %shuffle, %a +// CHECK-NEXT: ret <4 x i32> %mul +// CHECK-LABEL: test_vmul_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector 
<4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i16> %vecinit3.i, %a +// CHECK-NEXT: ret <4 x i16> %mul.i +// CHECK-LABEL: test_vmul_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <2 x i32> %vecinit1.i, %a +// CHECK-NEXT: ret <2 x i32> %mul.i +// CHECK-LABEL: test_vmul_n_f32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x float> undef, float %b, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x float> %vecinit.i, <2 x float> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = fmul <2 x float> %vecinit1.i, %a +// CHECK-NEXT: ret <2 x float> %mul.i +// CHECK-LABEL: test_vmul_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i16> %vecinit3.i, %a +// CHECK-NEXT: ret <4 x i16> %mul.i +// CHECK-LABEL: test_vmul_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <2 x i32> %vecinit1.i, %a +// CHECK-NEXT: ret <2 x i32> %mul.i +// CHECK-LABEL: test_vmulq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <8 x i16> %vecinit7.i, %a +// CHECK-NEXT: ret <8 x i16> %mul.i +// CHECK-LABEL: test_vmulq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i32> %vecinit3.i, %a +// CHECK-NEXT: ret <4 x i32> %mul.i +// CHECK-LABEL: test_vmulq_n_f32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x float> undef, float %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = fmul <4 x float> %vecinit3.i, %a +// CHECK-NEXT: ret <4 x float> %mul.i +// CHECK-LABEL: test_vmulq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <8 x i16> %vecinit7.i, %a +// CHECK-NEXT: ret <8 x i16> %mul.i +// CHECK-LABEL: test_vmulq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %mul.i = mul <4 x i32> %vecinit3.i, %a +// CHECK-NEXT: ret <4 x i32> %mul.i +// CHECK-LABEL: test_vmvn_s8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK-NEXT: ret <8 x i8> %neg.i +// CHECK-LABEL: test_vmvn_s16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK-NEXT: ret <4 x i16> %neg.i +// CHECK-LABEL: test_vmvn_s32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <2 x i32> %a, <i32 -1, i32 -1> +// CHECK-NEXT: ret <2 x i32> %neg.i +// CHECK-LABEL: test_vmvn_u8 +// CHECK: entry: 
+// CHECK-NEXT: %neg.i = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK-NEXT: ret <8 x i8> %neg.i +// CHECK-LABEL: test_vmvn_u16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK-NEXT: ret <4 x i16> %neg.i +// CHECK-LABEL: test_vmvn_u32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <2 x i32> %a, <i32 -1, i32 -1> +// CHECK-NEXT: ret <2 x i32> %neg.i +// CHECK-LABEL: test_vmvn_p8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK-NEXT: ret <8 x i8> %neg.i +// CHECK-LABEL: test_vmvnq_s8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK-NEXT: ret <16 x i8> %neg.i +// CHECK-LABEL: test_vmvnq_s16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK-NEXT: ret <8 x i16> %neg.i +// CHECK-LABEL: test_vmvnq_s32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK-NEXT: ret <4 x i32> %neg.i +// CHECK-LABEL: test_vmvnq_u8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK-NEXT: ret <16 x i8> %neg.i +// CHECK-LABEL: test_vmvnq_u16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK-NEXT: ret <8 x i16> %neg.i +// CHECK-LABEL: test_vmvnq_u32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK-NEXT: ret <4 x i32> %neg.i +// CHECK-LABEL: test_vmvnq_p8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK-NEXT: ret <16 x i8> %neg.i +// CHECK-LABEL: test_vneg_s8 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <8 x i8> zeroinitializer, %a +// CHECK-NEXT: ret <8 x i8> %sub.i +// CHECK-LABEL: test_vneg_s16 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <4 x i16> zeroinitializer, %a +// CHECK-NEXT: ret <4 x i16> %sub.i +// CHECK-LABEL: test_vneg_s32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <2 x i32> zeroinitializer, %a +// CHECK-NEXT: ret <2 x i32> %sub.i +// CHECK-LABEL: test_vneg_f32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a +// CHECK-NEXT: ret <2 x float> %sub.i +// CHECK-LABEL: test_vnegq_s8 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <16 x i8> zeroinitializer, %a +// CHECK-NEXT: ret <16 x i8> %sub.i +// CHECK-LABEL: test_vnegq_s16 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <8 x i16> zeroinitializer, %a +// CHECK-NEXT: ret <8 x i16> %sub.i +// CHECK-LABEL: test_vnegq_s32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <4 x i32> zeroinitializer, %a +// CHECK-NEXT: ret <4 x i32> %sub.i +// CHECK-LABEL: test_vnegq_f32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a +// CHECK-NEXT: ret <4 x float> %sub.i +// CHECK-LABEL: test_vorn_s8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK-NEXT: %or.i = or <8 x i8> %neg.i, %a +// CHECK-NEXT: ret <8 x i8> %or.i +// CHECK-LABEL: test_vorn_s16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK-NEXT: %or.i = or <4 x i16> %neg.i, %a +// CHECK-NEXT: ret <4 x i16> %or.i +// CHECK-LABEL: test_vorn_s32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <2 x i32> %b, <i32 -1, i32 -1> +// CHECK-NEXT: %or.i = or <2 x i32> %neg.i, %a +// CHECK-NEXT: ret <2 x i32> %or.i +// CHECK-LABEL: test_vorn_s64 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <1 x i64> %b, <i64 -1> +// CHECK-NEXT: %or.i = or <1 x i64> %neg.i, %a +// CHECK-NEXT: ret <1 x i64> %or.i +// CHECK-LABEL: test_vorn_u8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK-NEXT: %or.i = or <8 x i8> %neg.i, %a +// CHECK-NEXT: ret <8 x i8> %or.i +// CHECK-LABEL: test_vorn_u16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK-NEXT: %or.i = or <4 x i16> %neg.i, %a +// CHECK-NEXT: ret <4 x i16> 
%or.i +// CHECK-LABEL: test_vorn_u32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <2 x i32> %b, <i32 -1, i32 -1> +// CHECK-NEXT: %or.i = or <2 x i32> %neg.i, %a +// CHECK-NEXT: ret <2 x i32> %or.i +// CHECK-LABEL: test_vorn_u64 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <1 x i64> %b, <i64 -1> +// CHECK-NEXT: %or.i = or <1 x i64> %neg.i, %a +// CHECK-NEXT: ret <1 x i64> %or.i +// CHECK-LABEL: test_vornq_s8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK-NEXT: %or.i = or <16 x i8> %neg.i, %a +// CHECK-NEXT: ret <16 x i8> %or.i +// CHECK-LABEL: test_vornq_s16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK-NEXT: %or.i = or <8 x i16> %neg.i, %a +// CHECK-NEXT: ret <8 x i16> %or.i +// CHECK-LABEL: test_vornq_s32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK-NEXT: %or.i = or <4 x i32> %neg.i, %a +// CHECK-NEXT: ret <4 x i32> %or.i +// CHECK-LABEL: test_vornq_s64 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <2 x i64> %b, <i64 -1, i64 -1> +// CHECK-NEXT: %or.i = or <2 x i64> %neg.i, %a +// CHECK-NEXT: ret <2 x i64> %or.i +// CHECK-LABEL: test_vornq_u8 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK-NEXT: %or.i = or <16 x i8> %neg.i, %a +// CHECK-NEXT: ret <16 x i8> %or.i +// CHECK-LABEL: test_vornq_u16 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK-NEXT: %or.i = or <8 x i16> %neg.i, %a +// CHECK-NEXT: ret <8 x i16> %or.i +// CHECK-LABEL: test_vornq_u32 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK-NEXT: %or.i = or <4 x i32> %neg.i, %a +// CHECK-NEXT: ret <4 x i32> %or.i +// CHECK-LABEL: test_vornq_u64 +// CHECK: entry: +// CHECK-NEXT: %neg.i = xor <2 x i64> %b, <i64 -1, i64 -1> +// CHECK-NEXT: %or.i = or <2 x i64> %neg.i, %a +// CHECK-NEXT: ret <2 x i64> %or.i +// CHECK-LABEL: test_vorr_s8 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <8 x i8> %b, %a +// CHECK-NEXT: ret <8 x i8> %or.i +// CHECK-LABEL: test_vorr_s16 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <4 x i16> %b, %a +// CHECK-NEXT: ret <4 x i16> %or.i +// CHECK-LABEL: test_vorr_s32 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <2 x i32> %b, %a +// CHECK-NEXT: ret <2 x i32> %or.i +// CHECK-LABEL: test_vorr_s64 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <1 x i64> %b, %a +// CHECK-NEXT: ret <1 x i64> %or.i +// CHECK-LABEL: test_vorr_u8 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <8 x i8> %b, %a +// CHECK-NEXT: ret <8 x i8> %or.i +// CHECK-LABEL: test_vorr_u16 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <4 x i16> %b, %a +// CHECK-NEXT: ret <4 x i16> %or.i +// CHECK-LABEL: test_vorr_u32 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <2 x i32> %b, %a +// CHECK-NEXT: ret <2 x i32> %or.i +// CHECK-LABEL: test_vorr_u64 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <1 x i64> %b, %a +// CHECK-NEXT: ret <1 x i64> %or.i +// CHECK-LABEL: test_vorrq_s8 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <16 x i8> %b, %a +// CHECK-NEXT: ret <16 x i8> %or.i +// CHECK-LABEL: test_vorrq_s16 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <8 x i16> %b, %a +// CHECK-NEXT: ret <8 x i16> %or.i +// CHECK-LABEL: test_vorrq_s32 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <4 x i32> %b, %a +// CHECK-NEXT: ret <4 x i32> %or.i +// CHECK-LABEL: test_vorrq_s64 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <2 x i64> %b, %a +// CHECK-NEXT: ret <2 x i64> %or.i +// CHECK-LABEL: test_vorrq_u8 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <16 x i8> %b, %a +// CHECK-NEXT: ret <16 x i8> %or.i +// CHECK-LABEL: test_vorrq_u16 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <8 x i16> %b, %a +// 
CHECK-NEXT: ret <8 x i16> %or.i +// CHECK-LABEL: test_vorrq_u32 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <4 x i32> %b, %a +// CHECK-NEXT: ret <4 x i32> %or.i +// CHECK-LABEL: test_vorrq_u64 +// CHECK: entry: +// CHECK-NEXT: %or.i = or <2 x i64> %b, %a +// CHECK-NEXT: ret <2 x i64> %or.i +// CHECK-LABEL: test_vpadal_s8 +// CHECK: entry: +// CHECK-NEXT: %vpadal_v1.i = tail call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) +// CHECK-NEXT: ret <4 x i16> %vpadal_v1.i +// CHECK-LABEL: test_vpadal_s16 +// CHECK: entry: +// CHECK-NEXT: %vpadal_v2.i = tail call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) +// CHECK-NEXT: ret <2 x i32> %vpadal_v2.i +// CHECK-LABEL: test_vpadal_s32 +// CHECK: entry: +// CHECK-NEXT: %vpadal_v2.i = tail call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) +// CHECK-NEXT: ret <1 x i64> %vpadal_v2.i +// CHECK-LABEL: test_vpadal_u8 +// CHECK: entry: +// CHECK-NEXT: %vpadal_v1.i = tail call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) +// CHECK-NEXT: ret <4 x i16> %vpadal_v1.i +// CHECK-LABEL: test_vpadal_u16 +// CHECK: entry: +// CHECK-NEXT: %vpadal_v2.i = tail call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) +// CHECK-NEXT: ret <2 x i32> %vpadal_v2.i +// CHECK-LABEL: test_vpadal_u32 +// CHECK: entry: +// CHECK-NEXT: %vpadal_v2.i = tail call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) +// CHECK-NEXT: ret <1 x i64> %vpadal_v2.i +// CHECK-LABEL: test_vpadalq_s8 +// CHECK: entry: +// CHECK-NEXT: %vpadalq_v1.i = tail call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) +// CHECK-NEXT: ret <8 x i16> %vpadalq_v1.i +// CHECK-LABEL: test_vpadalq_s16 +// CHECK: entry: +// CHECK-NEXT: %vpadalq_v2.i = tail call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) +// CHECK-NEXT: ret <4 x i32> %vpadalq_v2.i +// CHECK-LABEL: test_vpadalq_s32 +// CHECK: entry: +// CHECK-NEXT: %vpadalq_v2.i = tail call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) +// CHECK-NEXT: ret <2 x i64> %vpadalq_v2.i +// CHECK-LABEL: test_vpadalq_u8 +// CHECK: entry: +// CHECK-NEXT: %vpadalq_v1.i = tail call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) +// CHECK-NEXT: ret <8 x i16> %vpadalq_v1.i +// CHECK-LABEL: test_vpadalq_u16 +// CHECK: entry: +// CHECK-NEXT: %vpadalq_v2.i = tail call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) +// CHECK-NEXT: ret <4 x i32> %vpadalq_v2.i +// CHECK-LABEL: test_vpadalq_u32 +// CHECK: entry: +// CHECK-NEXT: %vpadalq_v2.i = tail call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) +// CHECK-NEXT: ret <2 x i64> %vpadalq_v2.i +// CHECK-LABEL: test_vpadd_s8 +// CHECK: entry: +// CHECK-NEXT: %vpadd_v.i = tail call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vpadd_v.i +// CHECK-LABEL: test_vpadd_s16 +// CHECK: entry: +// CHECK-NEXT: %vpadd_v2.i = tail call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vpadd_v2.i +// CHECK-LABEL: test_vpadd_s32 +// CHECK: entry: +// CHECK-NEXT: %vpadd_v2.i = tail call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vpadd_v2.i +// CHECK-LABEL: test_vpadd_u8 +// CHECK: entry: +// CHECK-NEXT: %vpadd_v.i = tail call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x 
i8> %vpadd_v.i +// CHECK-LABEL: test_vpadd_u16 +// CHECK: entry: +// CHECK-NEXT: %vpadd_v2.i = tail call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vpadd_v2.i +// CHECK-LABEL: test_vpadd_u32 +// CHECK: entry: +// CHECK-NEXT: %vpadd_v2.i = tail call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vpadd_v2.i +// CHECK-LABEL: test_vpadd_f32 +// CHECK: entry: +// CHECK-NEXT: %vpadd_v2.i = tail call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK-NEXT: ret <2 x float> %vpadd_v2.i +// CHECK-LABEL: test_vpaddl_s8 +// CHECK: entry: +// CHECK-NEXT: %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) +// CHECK-NEXT: ret <4 x i16> %vpaddl.i +// CHECK-LABEL: test_vpaddl_s16 +// CHECK: entry: +// CHECK-NEXT: %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a) +// CHECK-NEXT: ret <2 x i32> %vpaddl1.i +// CHECK-LABEL: test_vpaddl_s32 +// CHECK: entry: +// CHECK-NEXT: %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a) +// CHECK-NEXT: ret <1 x i64> %vpaddl1.i +// CHECK-LABEL: test_vpaddl_u8 +// CHECK: entry: +// CHECK-NEXT: %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) +// CHECK-NEXT: ret <4 x i16> %vpaddl.i +// CHECK-LABEL: test_vpaddl_u16 +// CHECK: entry: +// CHECK-NEXT: %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a) +// CHECK-NEXT: ret <2 x i32> %vpaddl1.i +// CHECK-LABEL: test_vpaddl_u32 +// CHECK: entry: +// CHECK-NEXT: %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) +// CHECK-NEXT: ret <1 x i64> %vpaddl1.i +// CHECK-LABEL: test_vpaddlq_s8 +// CHECK: entry: +// CHECK-NEXT: %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) +// CHECK-NEXT: ret <8 x i16> %vpaddl.i +// CHECK-LABEL: test_vpaddlq_s16 +// CHECK: entry: +// CHECK-NEXT: %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) +// CHECK-NEXT: ret <4 x i32> %vpaddl1.i +// CHECK-LABEL: test_vpaddlq_s32 +// CHECK: entry: +// CHECK-NEXT: %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) +// CHECK-NEXT: ret <2 x i64> %vpaddl1.i +// CHECK-LABEL: test_vpaddlq_u8 +// CHECK: entry: +// CHECK-NEXT: %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) +// CHECK-NEXT: ret <8 x i16> %vpaddl.i +// CHECK-LABEL: test_vpaddlq_u16 +// CHECK: entry: +// CHECK-NEXT: %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) +// CHECK-NEXT: ret <4 x i32> %vpaddl1.i +// CHECK-LABEL: test_vpaddlq_u32 +// CHECK: entry: +// CHECK-NEXT: %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) +// CHECK-NEXT: ret <2 x i64> %vpaddl1.i +// CHECK-LABEL: test_vpmax_s8 +// CHECK: entry: +// CHECK-NEXT: %vpmax_v.i = tail call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vpmax_v.i +// CHECK-LABEL: test_vpmax_s16 +// CHECK: entry: +// CHECK-NEXT: %vpmax_v2.i = tail call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vpmax_v2.i +// CHECK-LABEL: test_vpmax_s32 +// CHECK: entry: +// CHECK-NEXT: %vpmax_v2.i = tail call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vpmax_v2.i +// CHECK-LABEL: test_vpmax_u8 +// CHECK: entry: +// CHECK-NEXT: %vpmax_v.i = tail 
call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vpmax_v.i +// CHECK-LABEL: test_vpmax_u16 +// CHECK: entry: +// CHECK-NEXT: %vpmax_v2.i = tail call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vpmax_v2.i +// CHECK-LABEL: test_vpmax_u32 +// CHECK: entry: +// CHECK-NEXT: %vpmax_v2.i = tail call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vpmax_v2.i +// CHECK-LABEL: test_vpmax_f32 +// CHECK: entry: +// CHECK-NEXT: %vpmax_v2.i = tail call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK-NEXT: ret <2 x float> %vpmax_v2.i +// CHECK-LABEL: test_vpmin_s8 +// CHECK: entry: +// CHECK-NEXT: %vpmin_v.i = tail call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vpmin_v.i +// CHECK-LABEL: test_vpmin_s16 +// CHECK: entry: +// CHECK-NEXT: %vpmin_v2.i = tail call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vpmin_v2.i +// CHECK-LABEL: test_vpmin_s32 +// CHECK: entry: +// CHECK-NEXT: %vpmin_v2.i = tail call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vpmin_v2.i +// CHECK-LABEL: test_vpmin_u8 +// CHECK: entry: +// CHECK-NEXT: %vpmin_v.i = tail call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vpmin_v.i +// CHECK-LABEL: test_vpmin_u16 +// CHECK: entry: +// CHECK-NEXT: %vpmin_v2.i = tail call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vpmin_v2.i +// CHECK-LABEL: test_vpmin_u32 +// CHECK: entry: +// CHECK-NEXT: %vpmin_v2.i = tail call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vpmin_v2.i +// CHECK-LABEL: test_vpmin_f32 +// CHECK: entry: +// CHECK-NEXT: %vpmin_v2.i = tail call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK-NEXT: ret <2 x float> %vpmin_v2.i +// CHECK-LABEL: test_vqabs_s8 +// CHECK: entry: +// CHECK-NEXT: %vqabs_v.i = tail call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) +// CHECK-NEXT: ret <8 x i8> %vqabs_v.i +// CHECK-LABEL: test_vqabs_s16 +// CHECK: entry: +// CHECK-NEXT: %vqabs_v1.i = tail call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) +// CHECK-NEXT: ret <4 x i16> %vqabs_v1.i +// CHECK-LABEL: test_vqabs_s32 +// CHECK: entry: +// CHECK-NEXT: %vqabs_v1.i = tail call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) +// CHECK-NEXT: ret <2 x i32> %vqabs_v1.i +// CHECK-LABEL: test_vqabsq_s8 +// CHECK: entry: +// CHECK-NEXT: %vqabsq_v.i = tail call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) +// CHECK-NEXT: ret <16 x i8> %vqabsq_v.i +// CHECK-LABEL: test_vqabsq_s16 +// CHECK: entry: +// CHECK-NEXT: %vqabsq_v1.i = tail call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) +// CHECK-NEXT: ret <8 x i16> %vqabsq_v1.i +// CHECK-LABEL: test_vqabsq_s32 +// CHECK: entry: +// CHECK-NEXT: %vqabsq_v1.i = tail call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) +// CHECK-NEXT: ret <4 x i32> %vqabsq_v1.i +// CHECK-LABEL: test_vqadd_s8 +// CHECK: entry: +// CHECK-NEXT: %vqadd_v.i = tail call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vqadd_v.i +// CHECK-LABEL: test_vqadd_s16 +// CHECK: entry: +// CHECK-NEXT: %vqadd_v2.i = tail call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x 
i16> %vqadd_v2.i +// CHECK-LABEL: test_vqadd_s32 +// CHECK: entry: +// CHECK-NEXT: %vqadd_v2.i = tail call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vqadd_v2.i +// CHECK-LABEL: test_vqadd_s64 +// CHECK: entry: +// CHECK-NEXT: %vqadd_v2.i = tail call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vqadd_v2.i +// CHECK-LABEL: test_vqadd_u8 +// CHECK: entry: +// CHECK-NEXT: %vqadd_v.i = tail call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vqadd_v.i +// CHECK-LABEL: test_vqadd_u16 +// CHECK: entry: +// CHECK-NEXT: %vqadd_v2.i = tail call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vqadd_v2.i +// CHECK-LABEL: test_vqadd_u32 +// CHECK: entry: +// CHECK-NEXT: %vqadd_v2.i = tail call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vqadd_v2.i +// CHECK-LABEL: test_vqadd_u64 +// CHECK: entry: +// CHECK-NEXT: %vqadd_v2.i = tail call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vqadd_v2.i +// CHECK-LABEL: test_vqaddq_s8 +// CHECK: entry: +// CHECK-NEXT: %vqaddq_v.i = tail call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vqaddq_v.i +// CHECK-LABEL: test_vqaddq_s16 +// CHECK: entry: +// CHECK-NEXT: %vqaddq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vqaddq_v2.i +// CHECK-LABEL: test_vqaddq_s32 +// CHECK: entry: +// CHECK-NEXT: %vqaddq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vqaddq_v2.i +// CHECK-LABEL: test_vqaddq_s64 +// CHECK: entry: +// CHECK-NEXT: %vqaddq_v2.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vqaddq_v2.i +// CHECK-LABEL: test_vqaddq_u8 +// CHECK: entry: +// CHECK-NEXT: %vqaddq_v.i = tail call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vqaddq_v.i +// CHECK-LABEL: test_vqaddq_u16 +// CHECK: entry: +// CHECK-NEXT: %vqaddq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vqaddq_v2.i +// CHECK-LABEL: test_vqaddq_u32 +// CHECK: entry: +// CHECK-NEXT: %vqaddq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vqaddq_v2.i +// CHECK-LABEL: test_vqaddq_u64 +// CHECK: entry: +// CHECK-NEXT: %vqaddq_v2.i = tail call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vqaddq_v2.i +// CHECK-LABEL: test_vqdmlal_s16 +// CHECK: entry: +// CHECK-NEXT: %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) +// CHECK-NEXT: %vqdmlal_v3.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) +// CHECK-NEXT: ret <4 x i32> %vqdmlal_v3.i +// CHECK-LABEL: test_vqdmlal_s32 +// CHECK: entry: +// CHECK-NEXT: %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) +// CHECK-NEXT: %vqdmlal_v3.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) +// CHECK-NEXT: ret <2 x i64> %vqdmlal_v3.i +// CHECK-LABEL: test_vqdmlal_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 
x i16> undef, <4 x i32> +// CHECK-NEXT: %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) +// CHECK-NEXT: %vqdmlal_v3.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) +// CHECK-NEXT: ret <4 x i32> %vqdmlal_v3.i +// CHECK-LABEL: test_vqdmlal_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> +// CHECK-NEXT: %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) +// CHECK-NEXT: %vqdmlal_v3.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) +// CHECK-NEXT: ret <2 x i64> %vqdmlal_v3.i +// CHECK-LABEL: test_vqdmlal_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vqdmlal5.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %vecinit3.i) +// CHECK-NEXT: %vqdmlal_v6.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal5.i) +// CHECK-NEXT: ret <4 x i32> %vqdmlal_v6.i +// CHECK-LABEL: test_vqdmlal_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vqdmlal3.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %vecinit1.i) +// CHECK-NEXT: %vqdmlal_v4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal3.i) +// CHECK-NEXT: ret <2 x i64> %vqdmlal_v4.i +// CHECK-LABEL: test_vqdmlsl_s16 +// CHECK: entry: +// CHECK-NEXT: %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) +// CHECK-NEXT: %vqdmlsl_v3.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) +// CHECK-NEXT: ret <4 x i32> %vqdmlsl_v3.i +// CHECK-LABEL: test_vqdmlsl_s32 +// CHECK: entry: +// CHECK-NEXT: %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) +// CHECK-NEXT: %vqdmlsl_v3.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) +// CHECK-NEXT: ret <2 x i64> %vqdmlsl_v3.i +// CHECK-LABEL: test_vqdmlsl_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) +// CHECK-NEXT: %vqdmlsl_v3.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) +// CHECK-NEXT: ret <4 x i32> %vqdmlsl_v3.i +// CHECK-LABEL: test_vqdmlsl_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> +// CHECK-NEXT: %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) +// CHECK-NEXT: %vqdmlsl_v3.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) +// CHECK-NEXT: ret <2 x i64> %vqdmlsl_v3.i +// CHECK-LABEL: test_vqdmlsl_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vqdmlal5.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> 
%vecinit3.i) +// CHECK-NEXT: %vqdmlsl_v6.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal5.i) +// CHECK-NEXT: ret <4 x i32> %vqdmlsl_v6.i +// CHECK-LABEL: test_vqdmlsl_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vqdmlal3.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %vecinit1.i) +// CHECK-NEXT: %vqdmlsl_v4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal3.i) +// CHECK-NEXT: ret <2 x i64> %vqdmlsl_v4.i +// CHECK-LABEL: test_vqdmulh_s16 +// CHECK: entry: +// CHECK-NEXT: %vqdmulh_v2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vqdmulh_v2.i +// CHECK-LABEL: test_vqdmulh_s32 +// CHECK: entry: +// CHECK-NEXT: %vqdmulh_v2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vqdmulh_v2.i +// CHECK-LABEL: test_vqdmulhq_s16 +// CHECK: entry: +// CHECK-NEXT: %vqdmulhq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vqdmulhq_v2.i +// CHECK-LABEL: test_vqdmulhq_s32 +// CHECK: entry: +// CHECK-NEXT: %vqdmulhq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vqdmulhq_v2.i +// CHECK-LABEL: test_vqdmulh_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: %vqdmulh_v2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) +// CHECK-NEXT: ret <4 x i16> %vqdmulh_v2.i +// CHECK-LABEL: test_vqdmulh_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> +// CHECK-NEXT: %vqdmulh_v2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) +// CHECK-NEXT: ret <2 x i32> %vqdmulh_v2.i +// CHECK-LABEL: test_vqdmulhq_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %vqdmulhq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) +// CHECK-NEXT: ret <8 x i16> %vqdmulhq_v2.i +// CHECK-LABEL: test_vqdmulhq_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %vqdmulhq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) +// CHECK-NEXT: ret <4 x i32> %vqdmulhq_v2.i +// CHECK-LABEL: test_vqdmulh_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vqdmulh_v5.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %vecinit3.i) +// CHECK-NEXT: ret <4 x i16> %vqdmulh_v5.i +// CHECK-LABEL: test_vqdmulh_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vqdmulh_v3.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %vecinit1.i) +// CHECK-NEXT: ret <2 x i32> %vqdmulh_v3.i +// CHECK-LABEL: test_vqdmulhq_n_s16 +// 
CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: %vqdmulhq_v9.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %vecinit7.i) +// CHECK-NEXT: ret <8 x i16> %vqdmulhq_v9.i +// CHECK-LABEL: test_vqdmulhq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vqdmulhq_v5.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %vecinit3.i) +// CHECK-NEXT: ret <4 x i32> %vqdmulhq_v5.i +// CHECK-LABEL: test_vqdmull_s16 +// CHECK: entry: +// CHECK-NEXT: %vqdmull_v2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i32> %vqdmull_v2.i +// CHECK-LABEL: test_vqdmull_s32 +// CHECK: entry: +// CHECK-NEXT: %vqdmull_v2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i64> %vqdmull_v2.i +// CHECK-LABEL: test_vqdmull_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: %vqdmull_v2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) +// CHECK-NEXT: ret <4 x i32> %vqdmull_v2.i +// CHECK-LABEL: test_vqdmull_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> +// CHECK-NEXT: %vqdmull_v2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) +// CHECK-NEXT: ret <2 x i64> %vqdmull_v2.i +// CHECK-LABEL: test_vqdmull_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vqdmull_v5.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %vecinit3.i) +// CHECK-NEXT: ret <4 x i32> %vqdmull_v5.i +// CHECK-LABEL: test_vqdmull_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vqdmull_v3.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %vecinit1.i) +// CHECK-NEXT: ret <2 x i64> %vqdmull_v3.i +// CHECK-LABEL: test_vqmovn_s16 +// CHECK: entry: +// CHECK-NEXT: %vqmovn_v1.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a) +// CHECK-NEXT: ret <8 x i8> %vqmovn_v1.i +// CHECK-LABEL: test_vqmovn_s32 +// CHECK: entry: +// CHECK-NEXT: %vqmovn_v1.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a) +// CHECK-NEXT: ret <4 x i16> %vqmovn_v1.i +// CHECK-LABEL: test_vqmovn_s64 +// CHECK: entry: +// CHECK-NEXT: %vqmovn_v1.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a) +// CHECK-NEXT: ret <2 x i32> %vqmovn_v1.i +// CHECK-LABEL: test_vqmovn_u16 +// CHECK: entry: +// CHECK-NEXT: %vqmovn_v1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a) +// CHECK-NEXT: ret <8 x i8> %vqmovn_v1.i +// CHECK-LABEL: test_vqmovn_u32 +// CHECK: entry: +// CHECK-NEXT: %vqmovn_v1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a) +// CHECK-NEXT: ret <4 x i16> %vqmovn_v1.i +// CHECK-LABEL: test_vqmovn_u64 +// CHECK: entry: 
+// CHECK-NEXT: %vqmovn_v1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a) +// CHECK-NEXT: ret <2 x i32> %vqmovn_v1.i +// CHECK-LABEL: test_vqmovun_s16 +// CHECK: entry: +// CHECK-NEXT: %vqmovun_v1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a) +// CHECK-NEXT: ret <8 x i8> %vqmovun_v1.i +// CHECK-LABEL: test_vqmovun_s32 +// CHECK: entry: +// CHECK-NEXT: %vqmovun_v1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a) +// CHECK-NEXT: ret <4 x i16> %vqmovun_v1.i +// CHECK-LABEL: test_vqmovun_s64 +// CHECK: entry: +// CHECK-NEXT: %vqmovun_v1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a) +// CHECK-NEXT: ret <2 x i32> %vqmovun_v1.i +// CHECK-LABEL: test_vqneg_s8 +// CHECK: entry: +// CHECK-NEXT: %vqneg_v.i = tail call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) +// CHECK-NEXT: ret <8 x i8> %vqneg_v.i +// CHECK-LABEL: test_vqneg_s16 +// CHECK: entry: +// CHECK-NEXT: %vqneg_v1.i = tail call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a) +// CHECK-NEXT: ret <4 x i16> %vqneg_v1.i +// CHECK-LABEL: test_vqneg_s32 +// CHECK: entry: +// CHECK-NEXT: %vqneg_v1.i = tail call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a) +// CHECK-NEXT: ret <2 x i32> %vqneg_v1.i +// CHECK-LABEL: test_vqnegq_s8 +// CHECK: entry: +// CHECK-NEXT: %vqnegq_v.i = tail call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) +// CHECK-NEXT: ret <16 x i8> %vqnegq_v.i +// CHECK-LABEL: test_vqnegq_s16 +// CHECK: entry: +// CHECK-NEXT: %vqnegq_v1.i = tail call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a) +// CHECK-NEXT: ret <8 x i16> %vqnegq_v1.i +// CHECK-LABEL: test_vqnegq_s32 +// CHECK: entry: +// CHECK-NEXT: %vqnegq_v1.i = tail call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a) +// CHECK-NEXT: ret <4 x i32> %vqnegq_v1.i +// CHECK-LABEL: test_vqrdmulh_s16 +// CHECK: entry: +// CHECK-NEXT: %vqrdmulh_v2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vqrdmulh_v2.i +// CHECK-LABEL: test_vqrdmulh_s32 +// CHECK: entry: +// CHECK-NEXT: %vqrdmulh_v2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vqrdmulh_v2.i +// CHECK-LABEL: test_vqrdmulhq_s16 +// CHECK: entry: +// CHECK-NEXT: %vqrdmulhq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vqrdmulhq_v2.i +// CHECK-LABEL: test_vqrdmulhq_s32 +// CHECK: entry: +// CHECK-NEXT: %vqrdmulhq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vqrdmulhq_v2.i +// CHECK-LABEL: test_vqrdmulh_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: %vqrdmulh_v2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) +// CHECK-NEXT: ret <4 x i16> %vqrdmulh_v2.i +// CHECK-LABEL: test_vqrdmulh_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> +// CHECK-NEXT: %vqrdmulh_v2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) +// CHECK-NEXT: ret <2 x i32> %vqrdmulh_v2.i +// CHECK-LABEL: test_vqrdmulhq_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <4 x i16> %b, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %vqrdmulhq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) +// CHECK-NEXT: ret <8 x i16> 
%vqrdmulhq_v2.i +// CHECK-LABEL: test_vqrdmulhq_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle = shufflevector <2 x i32> %b, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %vqrdmulhq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) +// CHECK-NEXT: ret <4 x i32> %vqrdmulhq_v2.i +// CHECK-LABEL: test_vqrdmulh_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i16> %vecinit.i, <4 x i16> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vqrdmulh_v5.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %vecinit3.i) +// CHECK-NEXT: ret <4 x i16> %vqrdmulh_v5.i +// CHECK-LABEL: test_vqrdmulh_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit1.i = shufflevector <2 x i32> %vecinit.i, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: %vqrdmulh_v3.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %vecinit1.i) +// CHECK-NEXT: ret <2 x i32> %vqrdmulh_v3.i +// CHECK-LABEL: test_vqrdmulhq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK-NEXT: %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: %vqrdmulhq_v9.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %vecinit7.i) +// CHECK-NEXT: ret <8 x i16> %vqrdmulhq_v9.i +// CHECK-LABEL: test_vqrdmulhq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vecinit.i = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK-NEXT: %vecinit3.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: %vqrdmulhq_v5.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %vecinit3.i) +// CHECK-NEXT: ret <4 x i32> %vqrdmulhq_v5.i +// CHECK-LABEL: test_vqrshl_s8 +// CHECK: entry: +// CHECK-NEXT: %vqrshl_v.i = tail call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vqrshl_v.i +// CHECK-LABEL: test_vqrshl_s16 +// CHECK: entry: +// CHECK-NEXT: %vqrshl_v2.i = tail call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vqrshl_v2.i +// CHECK-LABEL: test_vqrshl_s32 +// CHECK: entry: +// CHECK-NEXT: %vqrshl_v2.i = tail call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vqrshl_v2.i +// CHECK-LABEL: test_vqrshl_s64 +// CHECK: entry: +// CHECK-NEXT: %vqrshl_v2.i = tail call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vqrshl_v2.i +// CHECK-LABEL: test_vqrshl_u8 +// CHECK: entry: +// CHECK-NEXT: %vqrshl_v.i = tail call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vqrshl_v.i +// CHECK-LABEL: test_vqrshl_u16 +// CHECK: entry: +// CHECK-NEXT: %vqrshl_v2.i = tail call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vqrshl_v2.i +// CHECK-LABEL: test_vqrshl_u32 +// CHECK: entry: +// CHECK-NEXT: %vqrshl_v2.i = tail call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vqrshl_v2.i +// CHECK-LABEL: test_vqrshl_u64 +// CHECK: entry: +// CHECK-NEXT: %vqrshl_v2.i = tail call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vqrshl_v2.i 
+// CHECK-LABEL: test_vqrshlq_s8 +// CHECK: entry: +// CHECK-NEXT: %vqrshlq_v.i = tail call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vqrshlq_v.i +// CHECK-LABEL: test_vqrshlq_s16 +// CHECK: entry: +// CHECK-NEXT: %vqrshlq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vqrshlq_v2.i +// CHECK-LABEL: test_vqrshlq_s32 +// CHECK: entry: +// CHECK-NEXT: %vqrshlq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vqrshlq_v2.i +// CHECK-LABEL: test_vqrshlq_s64 +// CHECK: entry: +// CHECK-NEXT: %vqrshlq_v2.i = tail call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vqrshlq_v2.i +// CHECK-LABEL: test_vqrshlq_u8 +// CHECK: entry: +// CHECK-NEXT: %vqrshlq_v.i = tail call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vqrshlq_v.i +// CHECK-LABEL: test_vqrshlq_u16 +// CHECK: entry: +// CHECK-NEXT: %vqrshlq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vqrshlq_v2.i +// CHECK-LABEL: test_vqrshlq_u32 +// CHECK: entry: +// CHECK-NEXT: %vqrshlq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vqrshlq_v2.i +// CHECK-LABEL: test_vqrshlq_u64 +// CHECK: entry: +// CHECK-NEXT: %vqrshlq_v2.i = tail call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vqrshlq_v2.i +// CHECK-LABEL: test_vqrshrn_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vqrshrn_n1 = tail call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i8> %vqrshrn_n1 +// CHECK-LABEL: test_vqrshrn_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vqrshrn_n1 = tail call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i16> %vqrshrn_n1 +// CHECK-LABEL: test_vqrshrn_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vqrshrn_n1 = tail call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i32> %vqrshrn_n1 +// CHECK-LABEL: test_vqrshrn_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vqrshrn_n1 = tail call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i8> %vqrshrn_n1 +// CHECK-LABEL: test_vqrshrn_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vqrshrn_n1 = tail call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i16> %vqrshrn_n1 +// CHECK-LABEL: test_vqrshrn_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vqrshrn_n1 = tail call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i32> %vqrshrn_n1 +// CHECK-LABEL: test_vqrshrun_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vqrshrun_n1 = tail call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i8> %vqrshrun_n1 +// CHECK-LABEL: test_vqrshrun_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vqrshrun_n1 = tail call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i16> %vqrshrun_n1 +// CHECK-LABEL: test_vqrshrun_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vqrshrun_n1 = tail call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i32> %vqrshrun_n1 +// CHECK-LABEL: test_vqshl_s8 +// CHECK: entry: +// CHECK-NEXT: 
%vqshl_v.i = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vqshl_v.i +// CHECK-LABEL: test_vqshl_s16 +// CHECK: entry: +// CHECK-NEXT: %vqshl_v2.i = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vqshl_v2.i +// CHECK-LABEL: test_vqshl_s32 +// CHECK: entry: +// CHECK-NEXT: %vqshl_v2.i = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vqshl_v2.i +// CHECK-LABEL: test_vqshl_s64 +// CHECK: entry: +// CHECK-NEXT: %vqshl_v2.i = tail call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vqshl_v2.i +// CHECK-LABEL: test_vqshl_u8 +// CHECK: entry: +// CHECK-NEXT: %vqshl_v.i = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vqshl_v.i +// CHECK-LABEL: test_vqshl_u16 +// CHECK: entry: +// CHECK-NEXT: %vqshl_v2.i = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vqshl_v2.i +// CHECK-LABEL: test_vqshl_u32 +// CHECK: entry: +// CHECK-NEXT: %vqshl_v2.i = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vqshl_v2.i +// CHECK-LABEL: test_vqshl_u64 +// CHECK: entry: +// CHECK-NEXT: %vqshl_v2.i = tail call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vqshl_v2.i +// CHECK-LABEL: test_vqshlq_s8 +// CHECK: entry: +// CHECK-NEXT: %vqshlq_v.i = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vqshlq_v.i +// CHECK-LABEL: test_vqshlq_s16 +// CHECK: entry: +// CHECK-NEXT: %vqshlq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vqshlq_v2.i +// CHECK-LABEL: test_vqshlq_s32 +// CHECK: entry: +// CHECK-NEXT: %vqshlq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vqshlq_v2.i +// CHECK-LABEL: test_vqshlq_s64 +// CHECK: entry: +// CHECK-NEXT: %vqshlq_v2.i = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vqshlq_v2.i +// CHECK-LABEL: test_vqshlq_u8 +// CHECK: entry: +// CHECK-NEXT: %vqshlq_v.i = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vqshlq_v.i +// CHECK-LABEL: test_vqshlq_u16 +// CHECK: entry: +// CHECK-NEXT: %vqshlq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vqshlq_v2.i +// CHECK-LABEL: test_vqshlq_u32 +// CHECK: entry: +// CHECK-NEXT: %vqshlq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vqshlq_v2.i +// CHECK-LABEL: test_vqshlq_u64 +// CHECK: entry: +// CHECK-NEXT: %vqshlq_v2.i = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vqshlq_v2.i +// CHECK-LABEL: test_vqshlu_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vqshlu_n = tail call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vqshlu_n +// CHECK-LABEL: test_vqshlu_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vqshlu_n1 = tail call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> %a, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vqshlu_n1 +// CHECK-LABEL: 
test_vqshlu_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vqshlu_n1 = tail call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> %a, <2 x i32> ) +// CHECK-NEXT: ret <2 x i32> %vqshlu_n1 +// CHECK-LABEL: test_vqshlu_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vqshlu_n1 = tail call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> %a, <1 x i64> ) +// CHECK-NEXT: ret <1 x i64> %vqshlu_n1 +// CHECK-LABEL: test_vqshluq_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vqshlu_n = tail call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vqshlu_n +// CHECK-LABEL: test_vqshluq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vqshlu_n1 = tail call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vqshlu_n1 +// CHECK-LABEL: test_vqshluq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vqshlu_n1 = tail call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i32> %vqshlu_n1 +// CHECK-LABEL: test_vqshluq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vqshlu_n1 = tail call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i64> %vqshlu_n1 +// CHECK-LABEL: test_vqshl_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vqshl_n +// CHECK-LABEL: test_vqshl_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vqshl_n1 +// CHECK-LABEL: test_vqshl_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> ) +// CHECK-NEXT: ret <2 x i32> %vqshl_n1 +// CHECK-LABEL: test_vqshl_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %a, <1 x i64> ) +// CHECK-NEXT: ret <1 x i64> %vqshl_n1 +// CHECK-LABEL: test_vqshl_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vqshl_n +// CHECK-LABEL: test_vqshl_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vqshl_n1 +// CHECK-LABEL: test_vqshl_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> ) +// CHECK-NEXT: ret <2 x i32> %vqshl_n1 +// CHECK-LABEL: test_vqshl_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %a, <1 x i64> ) +// CHECK-NEXT: ret <1 x i64> %vqshl_n1 +// CHECK-LABEL: test_vqshlq_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vqshl_n +// CHECK-LABEL: test_vqshlq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vqshl_n1 +// CHECK-LABEL: test_vqshlq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i32> %vqshl_n1 +// CHECK-LABEL: test_vqshlq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i64> %vqshl_n1 +// CHECK-LABEL: test_vqshlq_n_u8 
+// CHECK: entry: +// CHECK-NEXT: %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vqshl_n +// CHECK-LABEL: test_vqshlq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vqshl_n1 +// CHECK-LABEL: test_vqshlq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i32> %vqshl_n1 +// CHECK-LABEL: test_vqshlq_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i64> %vqshl_n1 +// CHECK-LABEL: test_vqshrn_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vqshrn_n1 = tail call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i8> %vqshrn_n1 +// CHECK-LABEL: test_vqshrn_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vqshrn_n1 = tail call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i16> %vqshrn_n1 +// CHECK-LABEL: test_vqshrn_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vqshrn_n1 = tail call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i32> %vqshrn_n1 +// CHECK-LABEL: test_vqshrn_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vqshrn_n1 = tail call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i8> %vqshrn_n1 +// CHECK-LABEL: test_vqshrn_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vqshrn_n1 = tail call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i16> %vqshrn_n1 +// CHECK-LABEL: test_vqshrn_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vqshrn_n1 = tail call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i32> %vqshrn_n1 +// CHECK-LABEL: test_vqshrun_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vqshrun_n1 = tail call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i8> %vqshrun_n1 +// CHECK-LABEL: test_vqshrun_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vqshrun_n1 = tail call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i16> %vqshrun_n1 +// CHECK-LABEL: test_vqshrun_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vqshrun_n1 = tail call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i32> %vqshrun_n1 +// CHECK-LABEL: test_vqsub_s8 +// CHECK: entry: +// CHECK-NEXT: %vqsub_v.i = tail call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vqsub_v.i +// CHECK-LABEL: test_vqsub_s16 +// CHECK: entry: +// CHECK-NEXT: %vqsub_v2.i = tail call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vqsub_v2.i +// CHECK-LABEL: test_vqsub_s32 +// CHECK: entry: +// CHECK-NEXT: %vqsub_v2.i = tail call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vqsub_v2.i +// CHECK-LABEL: test_vqsub_s64 +// CHECK: entry: +// CHECK-NEXT: %vqsub_v2.i = tail call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vqsub_v2.i +// CHECK-LABEL: test_vqsub_u8 +// CHECK: entry: +// CHECK-NEXT: %vqsub_v.i = tail call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vqsub_v.i +// CHECK-LABEL: 
test_vqsub_u16 +// CHECK: entry: +// CHECK-NEXT: %vqsub_v2.i = tail call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vqsub_v2.i +// CHECK-LABEL: test_vqsub_u32 +// CHECK: entry: +// CHECK-NEXT: %vqsub_v2.i = tail call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vqsub_v2.i +// CHECK-LABEL: test_vqsub_u64 +// CHECK: entry: +// CHECK-NEXT: %vqsub_v2.i = tail call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vqsub_v2.i +// CHECK-LABEL: test_vqsubq_s8 +// CHECK: entry: +// CHECK-NEXT: %vqsubq_v.i = tail call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vqsubq_v.i +// CHECK-LABEL: test_vqsubq_s16 +// CHECK: entry: +// CHECK-NEXT: %vqsubq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vqsubq_v2.i +// CHECK-LABEL: test_vqsubq_s32 +// CHECK: entry: +// CHECK-NEXT: %vqsubq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vqsubq_v2.i +// CHECK-LABEL: test_vqsubq_s64 +// CHECK: entry: +// CHECK-NEXT: %vqsubq_v2.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vqsubq_v2.i +// CHECK-LABEL: test_vqsubq_u8 +// CHECK: entry: +// CHECK-NEXT: %vqsubq_v.i = tail call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vqsubq_v.i +// CHECK-LABEL: test_vqsubq_u16 +// CHECK: entry: +// CHECK-NEXT: %vqsubq_v2.i = tail call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vqsubq_v2.i +// CHECK-LABEL: test_vqsubq_u32 +// CHECK: entry: +// CHECK-NEXT: %vqsubq_v2.i = tail call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vqsubq_v2.i +// CHECK-LABEL: test_vqsubq_u64 +// CHECK: entry: +// CHECK-NEXT: %vqsubq_v2.i = tail call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vqsubq_v2.i +// CHECK-LABEL: test_vraddhn_s16 +// CHECK: entry: +// CHECK-NEXT: %vraddhn_v2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i8> %vraddhn_v2.i +// CHECK-LABEL: test_vraddhn_s32 +// CHECK: entry: +// CHECK-NEXT: %vraddhn_v2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i16> %vraddhn_v2.i +// CHECK-LABEL: test_vraddhn_s64 +// CHECK: entry: +// CHECK-NEXT: %vraddhn_v2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i32> %vraddhn_v2.i +// CHECK-LABEL: test_vraddhn_u16 +// CHECK: entry: +// CHECK-NEXT: %vraddhn_v2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i8> %vraddhn_v2.i +// CHECK-LABEL: test_vraddhn_u32 +// CHECK: entry: +// CHECK-NEXT: %vraddhn_v2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i16> %vraddhn_v2.i +// CHECK-LABEL: test_vraddhn_u64 +// CHECK: entry: +// CHECK-NEXT: %vraddhn_v2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i32> %vraddhn_v2.i +// CHECK-LABEL: test_vrecpe_f32 +// CHECK: entry: +// CHECK-NEXT: %vrecpe_v1.i = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a) +// 
CHECK-NEXT: ret <2 x float> %vrecpe_v1.i +// CHECK-LABEL: test_vrecpe_u32 +// CHECK: entry: +// CHECK-NEXT: %vrecpe_v1.i = tail call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a) +// CHECK-NEXT: ret <2 x i32> %vrecpe_v1.i +// CHECK-LABEL: test_vrecpeq_f32 +// CHECK: entry: +// CHECK-NEXT: %vrecpeq_v1.i = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a) +// CHECK-NEXT: ret <4 x float> %vrecpeq_v1.i +// CHECK-LABEL: test_vrecpeq_u32 +// CHECK: entry: +// CHECK-NEXT: %vrecpeq_v1.i = tail call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a) +// CHECK-NEXT: ret <4 x i32> %vrecpeq_v1.i +// CHECK-LABEL: test_vrecps_f32 +// CHECK: entry: +// CHECK-NEXT: %vrecps_v2.i = tail call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK-NEXT: ret <2 x float> %vrecps_v2.i +// CHECK-LABEL: test_vrecpsq_f32 +// CHECK: entry: +// CHECK-NEXT: %vrecpsq_v2.i = tail call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %a, <4 x float> %b) +// CHECK-NEXT: ret <4 x float> %vrecpsq_v2.i +// CHECK-LABEL: test_vreinterpret_s8_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_s8_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_s8_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_s8_u8 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i8> %a +// CHECK-LABEL: test_vreinterpret_s8_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_s8_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_s8_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_s8_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_s8_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_s8_p8 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i8> %a +// CHECK-LABEL: test_vreinterpret_s8_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_s16_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_s16_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_s16_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_s16_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_s16_u16 +// CHECK: entry: +// CHECK-NEXT: ret <4 x i16> %a +// CHECK-LABEL: test_vreinterpret_s16_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_s16_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: 
test_vreinterpret_s16_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_s16_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_s16_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_s16_p16 +// CHECK: entry: +// CHECK-NEXT: ret <4 x i16> %a +// CHECK-LABEL: test_vreinterpret_s32_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_s32_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_s32_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_s32_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_s32_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_s32_u32 +// CHECK: entry: +// CHECK-NEXT: ret <2 x i32> %a +// CHECK-LABEL: test_vreinterpret_s32_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_s32_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_s32_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_s32_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_s32_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_s64_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_s64_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_s64_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_s64_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_s64_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_s64_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_s64_u64 +// CHECK: entry: +// CHECK-NEXT: ret <1 x i64> %a +// CHECK-LABEL: test_vreinterpret_s64_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_s64_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_s64_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: 
test_vreinterpret_s64_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_u8_s8 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i8> %a +// CHECK-LABEL: test_vreinterpret_u8_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_u8_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_u8_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_u8_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_u8_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_u8_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_u8_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_u8_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_u8_p8 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i8> %a +// CHECK-LABEL: test_vreinterpret_u8_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_u16_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_u16_s16 +// CHECK: entry: +// CHECK-NEXT: ret <4 x i16> %a +// CHECK-LABEL: test_vreinterpret_u16_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_u16_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_u16_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_u16_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_u16_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_u16_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_u16_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_u16_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_u16_p16 +// CHECK: entry: +// CHECK-NEXT: ret <4 x i16> %a +// CHECK-LABEL: test_vreinterpret_u32_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_u32_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_u32_s32 +// CHECK: entry: +// CHECK-NEXT: ret <2 x i32> %a +// 
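Every 64-bit vreinterpret above follows the same two-shape pattern: a single IR bitcast when the element types differ, and a bare ret %a when source and destination map to the same LLVM type (signed, unsigned, and polynomial vectors of one lane width all share it). A minimal sketch, assuming nothing beyond <arm_neon.h>:

#include <arm_neon.h>

int8x8_t reinterpret_example(float32x2_t a) {
  return vreinterpret_s8_f32(a);  // <2 x float> -> <8 x i8>: one bitcast, bits unchanged
}

uint8x8_t reinterpret_identity(int8x8_t a) {
  return vreinterpret_u8_s8(a);   // both are <8 x i8> in IR, so no instruction at all
}

The u32 and u64 reinterprets continue below.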
CHECK-LABEL: test_vreinterpret_u32_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_u32_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_u32_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_u32_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_u32_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_u32_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_u32_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_u32_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vreinterpret_u64_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_u64_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_u64_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_u64_s64 +// CHECK: entry: +// CHECK-NEXT: ret <1 x i64> %a +// CHECK-LABEL: test_vreinterpret_u64_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_u64_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_u64_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_u64_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_u64_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_u64_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_u64_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <1 x i64> +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vreinterpret_f16_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vreinterpret_f16_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vreinterpret_f16_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vreinterpret_f16_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vreinterpret_f16_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: 
test_vreinterpret_f16_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vreinterpret_f16_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vreinterpret_f16_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vreinterpret_f16_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vreinterpret_f16_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vreinterpret_f16_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <4 x half> +// CHECK-NEXT: ret <4 x half> %0 +// CHECK-LABEL: test_vreinterpret_f32_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_f32_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_f32_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_f32_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_f32_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_f32_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_f32_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_f32_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_f32_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_f32_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_f32_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <2 x float> +// CHECK-NEXT: ret <2 x float> %0 +// CHECK-LABEL: test_vreinterpret_p8_s8 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i8> %a +// CHECK-LABEL: test_vreinterpret_p8_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_p8_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_p8_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_p8_u8 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i8> %a +// CHECK-LABEL: test_vreinterpret_p8_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_p8_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_p8_u64 +// CHECK: entry: +// CHECK-NEXT: 
%0 = bitcast <1 x i64> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_p8_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_p8_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_p8_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i16> %a to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vreinterpret_p16_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_p16_s16 +// CHECK: entry: +// CHECK-NEXT: ret <4 x i16> %a +// CHECK-LABEL: test_vreinterpret_p16_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_p16_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_p16_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_p16_u16 +// CHECK: entry: +// CHECK-NEXT: ret <4 x i16> %a +// CHECK-LABEL: test_vreinterpret_p16_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i32> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_p16_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <1 x i64> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_p16_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x half> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_p16_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x float> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpret_p16_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i8> %a to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_s8_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_s8_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_s8_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_s8_u8 +// CHECK: entry: +// CHECK-NEXT: ret <16 x i8> %a +// CHECK-LABEL: test_vreinterpretq_s8_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_s8_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_s8_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_s8_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_s8_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_s8_p8 +// CHECK: entry: +// CHECK-NEXT: ret <16 x i8> %a +// CHECK-LABEL: test_vreinterpretq_s8_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: ret 
<16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_s16_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_s16_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_s16_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_s16_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_s16_u16 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i16> %a +// CHECK-LABEL: test_vreinterpretq_s16_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_s16_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_s16_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_s16_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_s16_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_s16_p16 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i16> %a +// CHECK-LABEL: test_vreinterpretq_s32_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_s32_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_s32_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_s32_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_s32_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_s32_u32 +// CHECK: entry: +// CHECK-NEXT: ret <4 x i32> %a +// CHECK-LABEL: test_vreinterpretq_s32_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_s32_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_s32_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_s32_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_s32_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_s64_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_s64_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_s64_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to 
<2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_s64_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_s64_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_s64_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_s64_u64 +// CHECK: entry: +// CHECK-NEXT: ret <2 x i64> %a +// CHECK-LABEL: test_vreinterpretq_s64_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_s64_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_s64_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_s64_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_u8_s8 +// CHECK: entry: +// CHECK-NEXT: ret <16 x i8> %a +// CHECK-LABEL: test_vreinterpretq_u8_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_u8_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_u8_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_u8_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_u8_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_u8_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_u8_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_u8_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_u8_p8 +// CHECK: entry: +// CHECK-NEXT: ret <16 x i8> %a +// CHECK-LABEL: test_vreinterpretq_u8_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_u16_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_u16_s16 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i16> %a +// CHECK-LABEL: test_vreinterpretq_u16_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_u16_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_u16_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_u16_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <8 x i16> +// CHECK-NEXT: ret <8 x 
i16> %0 +// CHECK-LABEL: test_vreinterpretq_u16_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_u16_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_u16_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_u16_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_u16_p16 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i16> %a +// CHECK-LABEL: test_vreinterpretq_u32_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_u32_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_u32_s32 +// CHECK: entry: +// CHECK-NEXT: ret <4 x i32> %a +// CHECK-LABEL: test_vreinterpretq_u32_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_u32_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_u32_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_u32_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_u32_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_u32_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_u32_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_u32_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vreinterpretq_u64_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_u64_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_u64_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_u64_s64 +// CHECK: entry: +// CHECK-NEXT: ret <2 x i64> %a +// CHECK-LABEL: test_vreinterpretq_u64_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_u64_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_u64_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_u64_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_u64_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to 
<2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_u64_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_u64_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <2 x i64> +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vreinterpretq_f16_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f16_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f16_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f16_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f16_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f16_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f16_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f16_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f16_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f16_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f16_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <8 x half> +// CHECK-NEXT: ret <8 x half> %0 +// CHECK-LABEL: test_vreinterpretq_f32_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vreinterpretq_f32_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vreinterpretq_f32_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vreinterpretq_f32_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vreinterpretq_f32_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vreinterpretq_f32_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vreinterpretq_f32_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vreinterpretq_f32_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vreinterpretq_f32_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vreinterpretq_f32_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// 
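The 128-bit vreinterpretq checks mirror the 64-bit ones exactly, just over q-register types; the <8 x half> cases rely on the half-precision vector types that this test configuration makes available. A minimal sketch:

#include <arm_neon.h>

float16x8_t reinterpretq_example(uint32x4_t a) {
  return vreinterpretq_f16_u32(a);  // <4 x i32> -> <8 x half> bitcast
}

The remaining f32 and polynomial reinterprets follow.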
CHECK-LABEL: test_vreinterpretq_f32_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <4 x float> +// CHECK-NEXT: ret <4 x float> %0 +// CHECK-LABEL: test_vreinterpretq_p8_s8 +// CHECK: entry: +// CHECK-NEXT: ret <16 x i8> %a +// CHECK-LABEL: test_vreinterpretq_p8_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_p8_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_p8_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_p8_u8 +// CHECK: entry: +// CHECK-NEXT: ret <16 x i8> %a +// CHECK-LABEL: test_vreinterpretq_p8_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_p8_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_p8_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_p8_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_p8_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_p8_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x i16> %a to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vreinterpretq_p16_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_p16_s16 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i16> %a +// CHECK-LABEL: test_vreinterpretq_p16_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_p16_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_p16_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_p16_u16 +// CHECK: entry: +// CHECK-NEXT: ret <8 x i16> %a +// CHECK-LABEL: test_vreinterpretq_p16_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x i32> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_p16_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <2 x i64> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_p16_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <8 x half> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_p16_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <4 x float> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vreinterpretq_p16_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast <16 x i8> %a to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vrev16_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vrev16_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i8> 
%shuffle.i +// CHECK-LABEL: test_vrev16_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vrev16q_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vrev16q_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vrev16q_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vrev32_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vrev32_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vrev32_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vrev32_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vrev32_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vrev32_p16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vrev32q_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vrev32q_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i16> %shuffle.i +// CHECK-LABEL: test_vrev32q_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vrev32q_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i16> %shuffle.i +// CHECK-LABEL: test_vrev32q_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vrev32q_p16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i16> %shuffle.i +// CHECK-LABEL: test_vrev64_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vrev64_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vrev64_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> +// CHECK-NEXT: ret <2 x i32> %shuffle.i +// CHECK-LABEL: test_vrev64_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vrev64_u16 +// 
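All of the vrev16/vrev32/vrev64 variants above lower to a single shufflevector whose constant mask reverses the elements within each 16-, 32-, or 64-bit group. The mask literals were dropped from this dump, but they are fully determined by the intrinsic semantics: for vrev64_u8 the mask is <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>. A minimal sketch:

#include <arm_neon.h>

uint8x8_t rev64_example(uint8x8_t a) {
  return vrev64_u8(a);  // reverses the 8 bytes within the single 64-bit doubleword
}

The body of test_vrev64_u16 continues below.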
CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vrev64_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> +// CHECK-NEXT: ret <2 x i32> %shuffle.i +// CHECK-LABEL: test_vrev64_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i8> %shuffle.i +// CHECK-LABEL: test_vrev64_p16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> +// CHECK-NEXT: ret <4 x i16> %shuffle.i +// CHECK-LABEL: test_vrev64_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> +// CHECK-NEXT: ret <2 x float> %shuffle.i +// CHECK-LABEL: test_vrev64q_s8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vrev64q_s16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i16> %shuffle.i +// CHECK-LABEL: test_vrev64q_s32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> +// CHECK-NEXT: ret <4 x i32> %shuffle.i +// CHECK-LABEL: test_vrev64q_u8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vrev64q_u16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i16> %shuffle.i +// CHECK-LABEL: test_vrev64q_u32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> +// CHECK-NEXT: ret <4 x i32> %shuffle.i +// CHECK-LABEL: test_vrev64q_p8 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> +// CHECK-NEXT: ret <16 x i8> %shuffle.i +// CHECK-LABEL: test_vrev64q_p16 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> +// CHECK-NEXT: ret <8 x i16> %shuffle.i +// CHECK-LABEL: test_vrev64q_f32 +// CHECK: entry: +// CHECK-NEXT: %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> +// CHECK-NEXT: ret <4 x float> %shuffle.i +// CHECK-LABEL: test_vrhadd_s8 +// CHECK: entry: +// CHECK-NEXT: %vrhadd_v.i = tail call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vrhadd_v.i +// CHECK-LABEL: test_vrhadd_s16 +// CHECK: entry: +// CHECK-NEXT: %vrhadd_v2.i = tail call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vrhadd_v2.i +// CHECK-LABEL: test_vrhadd_s32 +// CHECK: entry: +// CHECK-NEXT: %vrhadd_v2.i = tail call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vrhadd_v2.i +// CHECK-LABEL: test_vrhadd_u8 +// CHECK: entry: +// CHECK-NEXT: %vrhadd_v.i = tail call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vrhadd_v.i +// CHECK-LABEL: test_vrhadd_u16 +// CHECK: entry: +// CHECK-NEXT: %vrhadd_v2.i = tail call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vrhadd_v2.i +// CHECK-LABEL: test_vrhadd_u32 +// CHECK: entry: +// CHECK-NEXT: %vrhadd_v2.i = tail call <2 x i32> 
@llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vrhadd_v2.i +// CHECK-LABEL: test_vrhaddq_s8 +// CHECK: entry: +// CHECK-NEXT: %vrhaddq_v.i = tail call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vrhaddq_v.i +// CHECK-LABEL: test_vrhaddq_s16 +// CHECK: entry: +// CHECK-NEXT: %vrhaddq_v2.i = tail call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vrhaddq_v2.i +// CHECK-LABEL: test_vrhaddq_s32 +// CHECK: entry: +// CHECK-NEXT: %vrhaddq_v2.i = tail call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vrhaddq_v2.i +// CHECK-LABEL: test_vrhaddq_u8 +// CHECK: entry: +// CHECK-NEXT: %vrhaddq_v.i = tail call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vrhaddq_v.i +// CHECK-LABEL: test_vrhaddq_u16 +// CHECK: entry: +// CHECK-NEXT: %vrhaddq_v2.i = tail call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vrhaddq_v2.i +// CHECK-LABEL: test_vrhaddq_u32 +// CHECK: entry: +// CHECK-NEXT: %vrhaddq_v2.i = tail call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vrhaddq_v2.i +// CHECK-LABEL: test_vrshl_s8 +// CHECK: entry: +// CHECK-NEXT: %vrshl_v.i = tail call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vrshl_v.i +// CHECK-LABEL: test_vrshl_s16 +// CHECK: entry: +// CHECK-NEXT: %vrshl_v2.i = tail call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vrshl_v2.i +// CHECK-LABEL: test_vrshl_s32 +// CHECK: entry: +// CHECK-NEXT: %vrshl_v2.i = tail call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vrshl_v2.i +// CHECK-LABEL: test_vrshl_s64 +// CHECK: entry: +// CHECK-NEXT: %vrshl_v2.i = tail call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vrshl_v2.i +// CHECK-LABEL: test_vrshl_u8 +// CHECK: entry: +// CHECK-NEXT: %vrshl_v.i = tail call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vrshl_v.i +// CHECK-LABEL: test_vrshl_u16 +// CHECK: entry: +// CHECK-NEXT: %vrshl_v2.i = tail call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vrshl_v2.i +// CHECK-LABEL: test_vrshl_u32 +// CHECK: entry: +// CHECK-NEXT: %vrshl_v2.i = tail call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vrshl_v2.i +// CHECK-LABEL: test_vrshl_u64 +// CHECK: entry: +// CHECK-NEXT: %vrshl_v2.i = tail call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vrshl_v2.i +// CHECK-LABEL: test_vrshlq_s8 +// CHECK: entry: +// CHECK-NEXT: %vrshlq_v.i = tail call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vrshlq_v.i +// CHECK-LABEL: test_vrshlq_s16 +// CHECK: entry: +// CHECK-NEXT: %vrshlq_v2.i = tail call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vrshlq_v2.i +// CHECK-LABEL: test_vrshlq_s32 +// CHECK: entry: +// CHECK-NEXT: %vrshlq_v2.i = tail call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vrshlq_v2.i +// CHECK-LABEL: test_vrshlq_s64 +// 
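Unlike the _n forms, vrshl takes its shift counts from a second vector register: each lane of b shifts the corresponding lane of a, a negative count shifts right, and the result is rounded. A minimal sketch:

#include <arm_neon.h>

int8x8_t rshl_example(int8x8_t a, int8x8_t b) {
  return vrshl_s8(a, b);  // per-lane rounding shift; the sign of each b lane picks the direction
}

test_vrshlq_s64 continues below.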
CHECK: entry: +// CHECK-NEXT: %vrshlq_v2.i = tail call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vrshlq_v2.i +// CHECK-LABEL: test_vrshlq_u8 +// CHECK: entry: +// CHECK-NEXT: %vrshlq_v.i = tail call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vrshlq_v.i +// CHECK-LABEL: test_vrshlq_u16 +// CHECK: entry: +// CHECK-NEXT: %vrshlq_v2.i = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vrshlq_v2.i +// CHECK-LABEL: test_vrshlq_u32 +// CHECK: entry: +// CHECK-NEXT: %vrshlq_v2.i = tail call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vrshlq_v2.i +// CHECK-LABEL: test_vrshlq_u64 +// CHECK: entry: +// CHECK-NEXT: %vrshlq_v2.i = tail call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vrshlq_v2.i +// CHECK-LABEL: test_vrshrn_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vrshrn_n1 = tail call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i8> %vrshrn_n1 +// CHECK-LABEL: test_vrshrn_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vrshrn_n1 = tail call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i16> %vrshrn_n1 +// CHECK-LABEL: test_vrshrn_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vrshrn_n1 = tail call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i32> %vrshrn_n1 +// CHECK-LABEL: test_vrshrn_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vrshrn_n1 = tail call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i8> %vrshrn_n1 +// CHECK-LABEL: test_vrshrn_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vrshrn_n1 = tail call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i16> %vrshrn_n1 +// CHECK-LABEL: test_vrshrn_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vrshrn_n1 = tail call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i32> %vrshrn_n1 +// CHECK-LABEL: test_vrshr_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n = tail call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vrshr_n +// CHECK-LABEL: test_vrshr_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %a, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vrshr_n1 +// CHECK-LABEL: test_vrshr_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %a, <2 x i32> ) +// CHECK-NEXT: ret <2 x i32> %vrshr_n1 +// CHECK-LABEL: test_vrshr_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %a, <1 x i64> ) +// CHECK-NEXT: ret <1 x i64> %vrshr_n1 +// CHECK-LABEL: test_vrshr_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n = tail call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vrshr_n +// CHECK-LABEL: test_vrshr_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %a, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vrshr_n1 +// CHECK-LABEL: test_vrshr_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %a, <2 x i32> ) +// CHECK-NEXT: ret <2 x i32> %vrshr_n1 +// CHECK-LABEL: test_vrshr_n_u64 
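vrshr_n has no dedicated IR intrinsic: it reuses @llvm.arm.neon.vrshifts/vrshiftu with a splat of the negated immediate, since a rounding right shift by n is a rounding shift by -n; that negative splat is the constant operand elided from the calls above. A minimal sketch, with n = 1 as an assumed value:

#include <arm_neon.h>

int16x4_t rshr_example(int16x4_t a) {
  return vrshr_n_s16(a, 1);  // rounding right shift; 1 is an assumed in-range immediate
}

The body of test_vrshr_n_u64 follows.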
+// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %a, <1 x i64> ) +// CHECK-NEXT: ret <1 x i64> %vrshr_n1 +// CHECK-LABEL: test_vrshrq_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n = tail call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vrshr_n +// CHECK-LABEL: test_vrshrq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vrshr_n1 +// CHECK-LABEL: test_vrshrq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i32> %vrshr_n1 +// CHECK-LABEL: test_vrshrq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i64> %vrshr_n1 +// CHECK-LABEL: test_vrshrq_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n = tail call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vrshr_n +// CHECK-LABEL: test_vrshrq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %a, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vrshr_n1 +// CHECK-LABEL: test_vrshrq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %a, <4 x i32> ) +// CHECK-NEXT: ret <4 x i32> %vrshr_n1 +// CHECK-LABEL: test_vrshrq_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vrshr_n1 = tail call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %a, <2 x i64> ) +// CHECK-NEXT: ret <2 x i64> %vrshr_n1 +// CHECK-LABEL: test_vrsqrte_f32 +// CHECK: entry: +// CHECK-NEXT: %vrsqrte_v1.i = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a) +// CHECK-NEXT: ret <2 x float> %vrsqrte_v1.i +// CHECK-LABEL: test_vrsqrte_u32 +// CHECK: entry: +// CHECK-NEXT: %vrsqrte_v1.i = tail call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a) +// CHECK-NEXT: ret <2 x i32> %vrsqrte_v1.i +// CHECK-LABEL: test_vrsqrteq_f32 +// CHECK: entry: +// CHECK-NEXT: %vrsqrteq_v1.i = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a) +// CHECK-NEXT: ret <4 x float> %vrsqrteq_v1.i +// CHECK-LABEL: test_vrsqrteq_u32 +// CHECK: entry: +// CHECK-NEXT: %vrsqrteq_v1.i = tail call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a) +// CHECK-NEXT: ret <4 x i32> %vrsqrteq_v1.i +// CHECK-LABEL: test_vrsqrts_f32 +// CHECK: entry: +// CHECK-NEXT: %vrsqrts_v2.i = tail call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK-NEXT: ret <2 x float> %vrsqrts_v2.i +// CHECK-LABEL: test_vrsqrtsq_f32 +// CHECK: entry: +// CHECK-NEXT: %vrsqrtsq_v2.i = tail call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %a, <4 x float> %b) +// CHECK-NEXT: ret <4 x float> %vrsqrtsq_v2.i +// CHECK-LABEL: test_vrsra_n_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> ) +// CHECK-NEXT: %vrsra_n = add <8 x i8> %0, %a +// CHECK-NEXT: ret <8 x i8> %vrsra_n +// CHECK-LABEL: test_vrsra_n_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %b, <4 x i16> ) +// CHECK-NEXT: %vrsra_n = add <4 x i16> %0, %a +// CHECK-NEXT: ret <4 x i16> %vrsra_n +// CHECK-LABEL: test_vrsra_n_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %b, <2 x i32> 
) +// CHECK-NEXT: %vrsra_n = add <2 x i32> %0, %a +// CHECK-NEXT: ret <2 x i32> %vrsra_n +// CHECK-LABEL: test_vrsra_n_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %b, <1 x i64> ) +// CHECK-NEXT: %vrsra_n = add <1 x i64> %0, %a +// CHECK-NEXT: ret <1 x i64> %vrsra_n +// CHECK-LABEL: test_vrsra_n_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> ) +// CHECK-NEXT: %vrsra_n = add <8 x i8> %0, %a +// CHECK-NEXT: ret <8 x i8> %vrsra_n +// CHECK-LABEL: test_vrsra_n_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %b, <4 x i16> ) +// CHECK-NEXT: %vrsra_n = add <4 x i16> %0, %a +// CHECK-NEXT: ret <4 x i16> %vrsra_n +// CHECK-LABEL: test_vrsra_n_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %b, <2 x i32> ) +// CHECK-NEXT: %vrsra_n = add <2 x i32> %0, %a +// CHECK-NEXT: ret <2 x i32> %vrsra_n +// CHECK-LABEL: test_vrsra_n_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %b, <1 x i64> ) +// CHECK-NEXT: %vrsra_n = add <1 x i64> %0, %a +// CHECK-NEXT: ret <1 x i64> %vrsra_n +// CHECK-LABEL: test_vrsraq_n_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> ) +// CHECK-NEXT: %vrsra_n = add <16 x i8> %0, %a +// CHECK-NEXT: ret <16 x i8> %vrsra_n +// CHECK-LABEL: test_vrsraq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %b, <8 x i16> ) +// CHECK-NEXT: %vrsra_n = add <8 x i16> %0, %a +// CHECK-NEXT: ret <8 x i16> %vrsra_n +// CHECK-LABEL: test_vrsraq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %b, <4 x i32> ) +// CHECK-NEXT: %vrsra_n = add <4 x i32> %0, %a +// CHECK-NEXT: ret <4 x i32> %vrsra_n +// CHECK-LABEL: test_vrsraq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %b, <2 x i64> ) +// CHECK-NEXT: %vrsra_n = add <2 x i64> %0, %a +// CHECK-NEXT: ret <2 x i64> %vrsra_n +// CHECK-LABEL: test_vrsraq_n_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> ) +// CHECK-NEXT: %vrsra_n = add <16 x i8> %0, %a +// CHECK-NEXT: ret <16 x i8> %vrsra_n +// CHECK-LABEL: test_vrsraq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %b, <8 x i16> ) +// CHECK-NEXT: %vrsra_n = add <8 x i16> %0, %a +// CHECK-NEXT: ret <8 x i16> %vrsra_n +// CHECK-LABEL: test_vrsraq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %b, <4 x i32> ) +// CHECK-NEXT: %vrsra_n = add <4 x i32> %0, %a +// CHECK-NEXT: ret <4 x i32> %vrsra_n +// CHECK-LABEL: test_vrsraq_n_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = tail call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %b, <2 x i64> ) +// CHECK-NEXT: %vrsra_n = add <2 x i64> %0, %a +// CHECK-NEXT: ret <2 x i64> %vrsra_n +// CHECK-LABEL: test_vrsubhn_s16 +// CHECK: entry: +// CHECK-NEXT: %vrsubhn_v2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i8> %vrsubhn_v2.i +// CHECK-LABEL: test_vrsubhn_s32 +// CHECK: entry: +// CHECK-NEXT: %vrsubhn_v2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i16> 
%vrsubhn_v2.i +// CHECK-LABEL: test_vrsubhn_s64 +// CHECK: entry: +// CHECK-NEXT: %vrsubhn_v2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i32> %vrsubhn_v2.i +// CHECK-LABEL: test_vrsubhn_u16 +// CHECK: entry: +// CHECK-NEXT: %vrsubhn_v2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i8> %vrsubhn_v2.i +// CHECK-LABEL: test_vrsubhn_u32 +// CHECK: entry: +// CHECK-NEXT: %vrsubhn_v2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i16> %vrsubhn_v2.i +// CHECK-LABEL: test_vrsubhn_u64 +// CHECK: entry: +// CHECK-NEXT: %vrsubhn_v2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i32> %vrsubhn_v2.i +// CHECK-LABEL: test_vset_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK-NEXT: ret <8 x i8> %vset_lane +// CHECK-LABEL: test_vset_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <4 x i16> %b, i16 %a, i32 3 +// CHECK-NEXT: ret <4 x i16> %vset_lane +// CHECK-LABEL: test_vset_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <2 x i32> %b, i32 %a, i32 1 +// CHECK-NEXT: ret <2 x i32> %vset_lane +// CHECK-LABEL: test_vset_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK-NEXT: ret <8 x i8> %vset_lane +// CHECK-LABEL: test_vset_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <4 x i16> %b, i16 %a, i32 3 +// CHECK-NEXT: ret <4 x i16> %vset_lane +// CHECK-LABEL: test_vset_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <2 x i32> %b, i32 %a, i32 1 +// CHECK-NEXT: ret <2 x i32> %vset_lane +// CHECK-LABEL: test_vset_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK-NEXT: ret <8 x i8> %vset_lane +// CHECK-LABEL: test_vset_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <4 x i16> %b, i16 %a, i32 3 +// CHECK-NEXT: ret <4 x i16> %vset_lane +// CHECK-LABEL: test_vset_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <2 x float> %b, float %a, i32 1 +// CHECK-NEXT: ret <2 x float> %vset_lane +// CHECK-LABEL: test_vset_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast half* %a to i16* +// CHECK-NEXT: %1 = load i16, i16* %0, align 2, !tbaa !3 +// CHECK-NEXT: %2 = bitcast <4 x half> %b to <4 x i16> +// CHECK-NEXT: %vset_lane = insertelement <4 x i16> %2, i16 %1, i32 1 +// CHECK-NEXT: %3 = bitcast <4 x i16> %vset_lane to <4 x half> +// CHECK-NEXT: ret <4 x half> %3 +// CHECK-LABEL: test_vsetq_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK-NEXT: ret <16 x i8> %vset_lane +// CHECK-LABEL: test_vsetq_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <8 x i16> %b, i16 %a, i32 7 +// CHECK-NEXT: ret <8 x i16> %vset_lane +// CHECK-LABEL: test_vsetq_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <4 x i32> %b, i32 %a, i32 3 +// CHECK-NEXT: ret <4 x i32> %vset_lane +// CHECK-LABEL: test_vsetq_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK-NEXT: ret <16 x i8> %vset_lane +// CHECK-LABEL: test_vsetq_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <8 x i16> %b, i16 %a, i32 7 +// CHECK-NEXT: ret <8 x i16> %vset_lane +// CHECK-LABEL: 
test_vsetq_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <4 x i32> %b, i32 %a, i32 3 +// CHECK-NEXT: ret <4 x i32> %vset_lane +// CHECK-LABEL: test_vsetq_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK-NEXT: ret <16 x i8> %vset_lane +// CHECK-LABEL: test_vsetq_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <8 x i16> %b, i16 %a, i32 7 +// CHECK-NEXT: ret <8 x i16> %vset_lane +// CHECK-LABEL: test_vsetq_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <4 x float> %b, float %a, i32 3 +// CHECK-NEXT: ret <4 x float> %vset_lane +// CHECK-LABEL: test_vsetq_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast half* %a to i16* +// CHECK-NEXT: %1 = load i16, i16* %0, align 2, !tbaa !3 +// CHECK-NEXT: %2 = bitcast <8 x half> %b to <8 x i16> +// CHECK-NEXT: %vset_lane = insertelement <8 x i16> %2, i16 %1, i32 3 +// CHECK-NEXT: %3 = bitcast <8 x i16> %vset_lane to <8 x half> +// CHECK-NEXT: ret <8 x half> %3 +// CHECK-LABEL: test_vset_lane_s64 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: ret <1 x i64> %vset_lane +// CHECK-LABEL: test_vset_lane_u64 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK-NEXT: ret <1 x i64> %vset_lane +// CHECK-LABEL: test_vsetq_lane_s64 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <2 x i64> %b, i64 %a, i32 1 +// CHECK-NEXT: ret <2 x i64> %vset_lane +// CHECK-LABEL: test_vsetq_lane_u64 +// CHECK: entry: +// CHECK-NEXT: %vset_lane = insertelement <2 x i64> %b, i64 %a, i32 1 +// CHECK-NEXT: ret <2 x i64> %vset_lane +// CHECK-LABEL: test_vshl_s8 +// CHECK: entry: +// CHECK-NEXT: %vshl_v.i = tail call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vshl_v.i +// CHECK-LABEL: test_vshl_s16 +// CHECK: entry: +// CHECK-NEXT: %vshl_v2.i = tail call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vshl_v2.i +// CHECK-LABEL: test_vshl_s32 +// CHECK: entry: +// CHECK-NEXT: %vshl_v2.i = tail call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vshl_v2.i +// CHECK-LABEL: test_vshl_s64 +// CHECK: entry: +// CHECK-NEXT: %vshl_v2.i = tail call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vshl_v2.i +// CHECK-LABEL: test_vshl_u8 +// CHECK: entry: +// CHECK-NEXT: %vshl_v.i = tail call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vshl_v.i +// CHECK-LABEL: test_vshl_u16 +// CHECK: entry: +// CHECK-NEXT: %vshl_v2.i = tail call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK-NEXT: ret <4 x i16> %vshl_v2.i +// CHECK-LABEL: test_vshl_u32 +// CHECK: entry: +// CHECK-NEXT: %vshl_v2.i = tail call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK-NEXT: ret <2 x i32> %vshl_v2.i +// CHECK-LABEL: test_vshl_u64 +// CHECK: entry: +// CHECK-NEXT: %vshl_v2.i = tail call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK-NEXT: ret <1 x i64> %vshl_v2.i +// CHECK-LABEL: test_vshlq_s8 +// CHECK: entry: +// CHECK-NEXT: %vshlq_v.i = tail call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vshlq_v.i +// CHECK-LABEL: test_vshlq_s16 +// CHECK: entry: +// CHECK-NEXT: %vshlq_v2.i = tail call <8 x 
i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vshlq_v2.i +// CHECK-LABEL: test_vshlq_s32 +// CHECK: entry: +// CHECK-NEXT: %vshlq_v2.i = tail call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vshlq_v2.i +// CHECK-LABEL: test_vshlq_s64 +// CHECK: entry: +// CHECK-NEXT: %vshlq_v2.i = tail call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vshlq_v2.i +// CHECK-LABEL: test_vshlq_u8 +// CHECK: entry: +// CHECK-NEXT: %vshlq_v.i = tail call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK-NEXT: ret <16 x i8> %vshlq_v.i +// CHECK-LABEL: test_vshlq_u16 +// CHECK: entry: +// CHECK-NEXT: %vshlq_v2.i = tail call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK-NEXT: ret <8 x i16> %vshlq_v2.i +// CHECK-LABEL: test_vshlq_u32 +// CHECK: entry: +// CHECK-NEXT: %vshlq_v2.i = tail call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK-NEXT: ret <4 x i32> %vshlq_v2.i +// CHECK-LABEL: test_vshlq_u64 +// CHECK: entry: +// CHECK-NEXT: %vshlq_v2.i = tail call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK-NEXT: ret <2 x i64> %vshlq_v2.i +// CHECK-LABEL: test_vshll_n_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = sext <8 x i8> %a to <8 x i16> +// CHECK-NEXT: %vshll_n = shl nsw <8 x i16> %0, +// CHECK-NEXT: ret <8 x i16> %vshll_n +// CHECK-LABEL: test_vshll_n_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = sext <4 x i16> %a to <4 x i32> +// CHECK-NEXT: %vshll_n = shl nsw <4 x i32> %0, +// CHECK-NEXT: ret <4 x i32> %vshll_n +// CHECK-LABEL: test_vshll_n_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = sext <2 x i32> %a to <2 x i64> +// CHECK-NEXT: %vshll_n = shl nsw <2 x i64> %0, +// CHECK-NEXT: ret <2 x i64> %vshll_n +// CHECK-LABEL: test_vshll_n_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = zext <8 x i8> %a to <8 x i16> +// CHECK-NEXT: %vshll_n = shl nuw nsw <8 x i16> %0, +// CHECK-NEXT: ret <8 x i16> %vshll_n +// CHECK-LABEL: test_vshll_n_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = zext <4 x i16> %a to <4 x i32> +// CHECK-NEXT: %vshll_n = shl nuw nsw <4 x i32> %0, +// CHECK-NEXT: ret <4 x i32> %vshll_n +// CHECK-LABEL: test_vshll_n_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = zext <2 x i32> %a to <2 x i64> +// CHECK-NEXT: %vshll_n = shl nuw nsw <2 x i64> %0, +// CHECK-NEXT: ret <2 x i64> %vshll_n +// CHECK-LABEL: test_vshl_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <8 x i8> %a, +// CHECK-NEXT: ret <8 x i8> %vshl_n +// CHECK-LABEL: test_vshl_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <4 x i16> %a, +// CHECK-NEXT: ret <4 x i16> %vshl_n +// CHECK-LABEL: test_vshl_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <2 x i32> %a, +// CHECK-NEXT: ret <2 x i32> %vshl_n +// CHECK-LABEL: test_vshl_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <1 x i64> %a, +// CHECK-NEXT: ret <1 x i64> %vshl_n +// CHECK-LABEL: test_vshl_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <8 x i8> %a, +// CHECK-NEXT: ret <8 x i8> %vshl_n +// CHECK-LABEL: test_vshl_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <4 x i16> %a, +// CHECK-NEXT: ret <4 x i16> %vshl_n +// CHECK-LABEL: test_vshl_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <2 x i32> %a, +// CHECK-NEXT: ret <2 x i32> %vshl_n +// CHECK-LABEL: test_vshl_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <1 x i64> %a, +// CHECK-NEXT: ret <1 x i64> %vshl_n +// CHECK-LABEL: test_vshlq_n_s8 +// 
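+//
+// A minimal sketch (my assumption; the real functions live in
+// test/CodeGen/arm_neon_intrinsics.c) of the C shape behind the
+// vshl/vshll_n/vshl_n expectations above, assuming #include <arm_neon.h>:
+//
+//   int8x8_t  test_vshl_s8(int8x8_t a, int8x8_t b) { return vshl_s8(a, b); }
+//   int16x8_t test_vshll_n_s8(int8x8_t a)          { return vshll_n_s8(a, 1); }
+//   int8x8_t  test_vshl_n_s8(int8x8_t a)           { return vshl_n_s8(a, 1); }
+//
+// Register shifts keep the @llvm.arm.neon.vshifts/vshiftu intrinsics, while
+// the _n forms fold to plain IR shl once the immediate is known; the shift
+// amounts (1 here) are only example values.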
CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <16 x i8> %a, +// CHECK-NEXT: ret <16 x i8> %vshl_n +// CHECK-LABEL: test_vshlq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <8 x i16> %a, +// CHECK-NEXT: ret <8 x i16> %vshl_n +// CHECK-LABEL: test_vshlq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <4 x i32> %a, +// CHECK-NEXT: ret <4 x i32> %vshl_n +// CHECK-LABEL: test_vshlq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <2 x i64> %a, +// CHECK-NEXT: ret <2 x i64> %vshl_n +// CHECK-LABEL: test_vshlq_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <16 x i8> %a, +// CHECK-NEXT: ret <16 x i8> %vshl_n +// CHECK-LABEL: test_vshlq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <8 x i16> %a, +// CHECK-NEXT: ret <8 x i16> %vshl_n +// CHECK-LABEL: test_vshlq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <4 x i32> %a, +// CHECK-NEXT: ret <4 x i32> %vshl_n +// CHECK-LABEL: test_vshlq_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vshl_n = shl <2 x i64> %a, +// CHECK-NEXT: ret <2 x i64> %vshl_n +// CHECK-LABEL: test_vshrn_n_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = lshr <8 x i16> %a, +// CHECK-NEXT: %vshrn_n = trunc <8 x i16> %0 to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %vshrn_n +// CHECK-LABEL: test_vshrn_n_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = lshr <4 x i32> %a, +// CHECK-NEXT: %vshrn_n = trunc <4 x i32> %0 to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vshrn_n +// CHECK-LABEL: test_vshrn_n_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = lshr <2 x i64> %a, +// CHECK-NEXT: %vshrn_n = trunc <2 x i64> %0 to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vshrn_n +// CHECK-LABEL: test_vshrn_n_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = lshr <8 x i16> %a, +// CHECK-NEXT: %vshrn_n = trunc <8 x i16> %0 to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %vshrn_n +// CHECK-LABEL: test_vshrn_n_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = lshr <4 x i32> %a, +// CHECK-NEXT: %vshrn_n = trunc <4 x i32> %0 to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vshrn_n +// CHECK-LABEL: test_vshrn_n_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = lshr <2 x i64> %a, +// CHECK-NEXT: %vshrn_n = trunc <2 x i64> %0 to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vshrn_n +// CHECK-LABEL: test_vshr_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = ashr <8 x i8> %a, +// CHECK-NEXT: ret <8 x i8> %vshr_n +// CHECK-LABEL: test_vshr_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = ashr <4 x i16> %a, +// CHECK-NEXT: ret <4 x i16> %vshr_n +// CHECK-LABEL: test_vshr_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = ashr <2 x i32> %a, +// CHECK-NEXT: ret <2 x i32> %vshr_n +// CHECK-LABEL: test_vshr_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = ashr <1 x i64> %a, +// CHECK-NEXT: ret <1 x i64> %vshr_n +// CHECK-LABEL: test_vshr_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = lshr <8 x i8> %a, +// CHECK-NEXT: ret <8 x i8> %vshr_n +// CHECK-LABEL: test_vshr_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = lshr <4 x i16> %a, +// CHECK-NEXT: ret <4 x i16> %vshr_n +// CHECK-LABEL: test_vshr_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = lshr <2 x i32> %a, +// CHECK-NEXT: ret <2 x i32> %vshr_n +// CHECK-LABEL: test_vshr_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = lshr <1 x i64> %a, +// CHECK-NEXT: ret <1 x i64> %vshr_n +// CHECK-LABEL: test_vshrq_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = ashr <16 x i8> %a, +// CHECK-NEXT: ret <16 x i8> %vshr_n +// CHECK-LABEL: test_vshrq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = ashr <8 x i16> %a, +// CHECK-NEXT: ret <8 x i16> %vshr_n +// CHECK-LABEL: test_vshrq_n_s32 +// CHECK: 
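+//
+// Likewise for the plain and narrowing right shifts above (a sketch under the
+// same assumption; the splat shift constants are not visible in these lines,
+// so 1 is just an example immediate):
+//
+//   int8x8_t test_vshr_n_s8(int8x8_t a)    { return vshr_n_s8(a, 1); }
+//   int8x8_t test_vshrn_n_s16(int16x8_t a) { return vshrn_n_s16(a, 1); }
+//
+// Signed inputs produce ashr, unsigned inputs lshr, and vshrn_n adds a trunc
+// to the narrower element type.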
entry: +// CHECK-NEXT: %vshr_n = ashr <4 x i32> %a, +// CHECK-NEXT: ret <4 x i32> %vshr_n +// CHECK-LABEL: test_vshrq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = ashr <2 x i64> %a, +// CHECK-NEXT: ret <2 x i64> %vshr_n +// CHECK-LABEL: test_vshrq_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = lshr <16 x i8> %a, +// CHECK-NEXT: ret <16 x i8> %vshr_n +// CHECK-LABEL: test_vshrq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = lshr <8 x i16> %a, +// CHECK-NEXT: ret <8 x i16> %vshr_n +// CHECK-LABEL: test_vshrq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = lshr <4 x i32> %a, +// CHECK-NEXT: ret <4 x i32> %vshr_n +// CHECK-LABEL: test_vshrq_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vshr_n = lshr <2 x i64> %a, +// CHECK-NEXT: ret <2 x i64> %vshr_n +// CHECK-LABEL: test_vsli_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vsli_n +// CHECK-LABEL: test_vsli_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vsli_n2 +// CHECK-LABEL: test_vsli_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> ) +// CHECK-NEXT: ret <2 x i32> %vsli_n2 +// CHECK-LABEL: test_vsli_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> ) +// CHECK-NEXT: ret <1 x i64> %vsli_n2 +// CHECK-LABEL: test_vsli_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vsli_n +// CHECK-LABEL: test_vsli_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vsli_n2 +// CHECK-LABEL: test_vsli_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> ) +// CHECK-NEXT: ret <2 x i32> %vsli_n2 +// CHECK-LABEL: test_vsli_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> ) +// CHECK-NEXT: ret <1 x i64> %vsli_n2 +// CHECK-LABEL: test_vsli_n_p8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vsli_n +// CHECK-LABEL: test_vsli_n_p16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vsli_n2 +// CHECK-LABEL: test_vsliq_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vsli_n +// CHECK-LABEL: test_vsliq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vsli_n2 +// CHECK-LABEL: test_vsliq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) +// CHECK-NEXT: ret <4 x i32> %vsli_n2 +// CHECK-LABEL: test_vsliq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <2 x i64> 
@llvm.arm.neon.vshiftins.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) +// CHECK-NEXT: ret <2 x i64> %vsli_n2 +// CHECK-LABEL: test_vsliq_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vsli_n +// CHECK-LABEL: test_vsliq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vsli_n2 +// CHECK-LABEL: test_vsliq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) +// CHECK-NEXT: ret <4 x i32> %vsli_n2 +// CHECK-LABEL: test_vsliq_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) +// CHECK-NEXT: ret <2 x i64> %vsli_n2 +// CHECK-LABEL: test_vsliq_n_p8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vsli_n +// CHECK-LABEL: test_vsliq_n_p16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vsli_n2 +// CHECK-LABEL: test_vsra_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = ashr <8 x i8> %b, +// CHECK-NEXT: %0 = add <8 x i8> %vsra_n, %a +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vsra_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = ashr <4 x i16> %b, +// CHECK-NEXT: %0 = add <4 x i16> %vsra_n, %a +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vsra_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = ashr <2 x i32> %b, +// CHECK-NEXT: %0 = add <2 x i32> %vsra_n, %a +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vsra_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = ashr <1 x i64> %b, +// CHECK-NEXT: %0 = add <1 x i64> %vsra_n, %a +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vsra_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = lshr <8 x i8> %b, +// CHECK-NEXT: %0 = add <8 x i8> %vsra_n, %a +// CHECK-NEXT: ret <8 x i8> %0 +// CHECK-LABEL: test_vsra_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = lshr <4 x i16> %b, +// CHECK-NEXT: %0 = add <4 x i16> %vsra_n, %a +// CHECK-NEXT: ret <4 x i16> %0 +// CHECK-LABEL: test_vsra_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = lshr <2 x i32> %b, +// CHECK-NEXT: %0 = add <2 x i32> %vsra_n, %a +// CHECK-NEXT: ret <2 x i32> %0 +// CHECK-LABEL: test_vsra_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = lshr <1 x i64> %b, +// CHECK-NEXT: %0 = add <1 x i64> %vsra_n, %a +// CHECK-NEXT: ret <1 x i64> %0 +// CHECK-LABEL: test_vsraq_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = ashr <16 x i8> %b, +// CHECK-NEXT: %0 = add <16 x i8> %vsra_n, %a +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vsraq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = ashr <8 x i16> %b, +// CHECK-NEXT: %0 = add <8 x i16> %vsra_n, %a +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vsraq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = ashr <4 x i32> %b, +// CHECK-NEXT: %0 = add <4 x i32> %vsra_n, %a +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vsraq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = ashr <2 x i64> %b, +// CHECK-NEXT: %0 = add <2 x i64> %vsra_n, %a +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vsraq_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = lshr <16 x 
i8> %b, +// CHECK-NEXT: %0 = add <16 x i8> %vsra_n, %a +// CHECK-NEXT: ret <16 x i8> %0 +// CHECK-LABEL: test_vsraq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = lshr <8 x i16> %b, +// CHECK-NEXT: %0 = add <8 x i16> %vsra_n, %a +// CHECK-NEXT: ret <8 x i16> %0 +// CHECK-LABEL: test_vsraq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = lshr <4 x i32> %b, +// CHECK-NEXT: %0 = add <4 x i32> %vsra_n, %a +// CHECK-NEXT: ret <4 x i32> %0 +// CHECK-LABEL: test_vsraq_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vsra_n = lshr <2 x i64> %b, +// CHECK-NEXT: %0 = add <2 x i64> %vsra_n, %a +// CHECK-NEXT: ret <2 x i64> %0 +// CHECK-LABEL: test_vsri_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vsli_n +// CHECK-LABEL: test_vsri_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vsli_n2 +// CHECK-LABEL: test_vsri_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> ) +// CHECK-NEXT: ret <2 x i32> %vsli_n2 +// CHECK-LABEL: test_vsri_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> ) +// CHECK-NEXT: ret <1 x i64> %vsli_n2 +// CHECK-LABEL: test_vsri_n_u8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vsli_n +// CHECK-LABEL: test_vsri_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vsli_n2 +// CHECK-LABEL: test_vsri_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> ) +// CHECK-NEXT: ret <2 x i32> %vsli_n2 +// CHECK-LABEL: test_vsri_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> ) +// CHECK-NEXT: ret <1 x i64> %vsli_n2 +// CHECK-LABEL: test_vsri_n_p8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> ) +// CHECK-NEXT: ret <8 x i8> %vsli_n +// CHECK-LABEL: test_vsri_n_p16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> ) +// CHECK-NEXT: ret <4 x i16> %vsli_n2 +// CHECK-LABEL: test_vsriq_n_s8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vsli_n +// CHECK-LABEL: test_vsriq_n_s16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vsli_n2 +// CHECK-LABEL: test_vsriq_n_s32 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) +// CHECK-NEXT: ret <4 x i32> %vsli_n2 +// CHECK-LABEL: test_vsriq_n_s64 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) +// CHECK-NEXT: ret <2 x i64> %vsli_n2 +// CHECK-LABEL: test_vsriq_n_u8 +// CHECK: 
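+//
+// The vsra/vrsra and vsli/vsri groups above pair a shift with an accumulate
+// or a bit-insert. Sketch of the C side (assumption, example immediates):
+//
+//   int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) { return vsra_n_s8(a, b, 1); }
+//   int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) { return vsri_n_s8(a, b, 1); }
+//
+// vsra_n folds to ashr/lshr plus add, whereas vsli_n and vsri_n both lower to
+// @llvm.arm.neon.vshiftins (vsri being the same intrinsic with the shift
+// vector negated), which is why the vsri tests reuse %vsli_n value names.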
entry: +// CHECK-NEXT: %vsli_n = tail call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vsli_n +// CHECK-LABEL: test_vsriq_n_u16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vsli_n2 +// CHECK-LABEL: test_vsriq_n_u32 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) +// CHECK-NEXT: ret <4 x i32> %vsli_n2 +// CHECK-LABEL: test_vsriq_n_u64 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) +// CHECK-NEXT: ret <2 x i64> %vsli_n2 +// CHECK-LABEL: test_vsriq_n_p8 +// CHECK: entry: +// CHECK-NEXT: %vsli_n = tail call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +// CHECK-NEXT: ret <16 x i8> %vsli_n +// CHECK-LABEL: test_vsriq_n_p16 +// CHECK: entry: +// CHECK-NEXT: %vsli_n2 = tail call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +// CHECK-NEXT: ret <8 x i16> %vsli_n2 +// CHECK-LABEL: test_vst1q_u8 +// CHECK: entry: +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %0, <8 x i16> %b, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %0, <4 x i32> %b, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %0, <2 x i64> %b, i32 8) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_s8 +// CHECK: entry: +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %0, <8 x i16> %b, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %0, <4 x i32> %b, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %0, <2 x i64> %b, i32 8) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast half* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8f16(i8* %0, <8 x half> %b, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast float* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %0, <4 x float> %b, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_p8 +// CHECK: entry: +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %0, <8 x 
i16> %b, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_u8 +// CHECK: entry: +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %0, <4 x i16> %b, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %0, <2 x i32> %b, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %0, <1 x i64> %b, i32 8) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_s8 +// CHECK: entry: +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %0, <4 x i16> %b, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %0, <2 x i32> %b, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %0, <1 x i64> %b, i32 8) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast half* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4f16(i8* %0, <4 x half> %b, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast float* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %0, <2 x float> %b, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_p8 +// CHECK: entry: +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %0, <4 x i16> %b, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <16 x i8> %b, i32 15 +// CHECK-NEXT: store i8 %0, i8* %a, align 1 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <8 x i16> %b, i32 7 +// CHECK-NEXT: store i16 %0, i16* %a, align 2 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <4 x i32> %b, i32 3 +// CHECK-NEXT: store i32 %0, i32* %a, align 4 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to i8* +// CHECK-NEXT: %1 = shufflevector <2 x i64> %b, <2 x i64> undef, <1 x i32> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %0, <1 x i64> %1, i32 8) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <16 x i8> %b, i32 15 +// CHECK-NEXT: store i8 %0, i8* %a, align 1 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <8 x i16> %b, i32 7 +// 
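+//
+// Store sketches for the vst1/vst1q_lane blocks above (assumption, as before):
+//
+//   void test_vst1_u8(uint8_t *a, uint8x8_t b)        { vst1_u8(a, b); }
+//   void test_vst1q_lane_u8(uint8_t *a, uint8x16_t b) { vst1q_lane_u8(a, b, 15); }
+//
+// Lane stores of 8/16/32-bit elements become extractelement plus an ordinary
+// store; the 64-bit lane forms keep @llvm.arm.neon.vst1 on a <1 x i64>
+// shuffled out of the input.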
CHECK-NEXT: store i16 %0, i16* %a, align 2 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <4 x i32> %b, i32 3 +// CHECK-NEXT: store i32 %0, i32* %a, align 4 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = bitcast i64* %a to i8* +// CHECK-NEXT: %1 = shufflevector <2 x i64> %b, <2 x i64> undef, <1 x i32> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %0, <1 x i64> %1, i32 8) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <8 x half> %b, i32 7 +// CHECK-NEXT: store half %0, half* %a, align 2 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <4 x float> %b, i32 3 +// CHECK-NEXT: store float %0, float* %a, align 4 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <16 x i8> %b, i32 15 +// CHECK-NEXT: store i8 %0, i8* %a, align 1 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1q_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <8 x i16> %b, i32 7 +// CHECK-NEXT: store i16 %0, i16* %a, align 2 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <8 x i8> %b, i32 7 +// CHECK-NEXT: store i8 %0, i8* %a, align 1 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <4 x i16> %b, i32 3 +// CHECK-NEXT: store i16 %0, i16* %a, align 2 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <2 x i32> %b, i32 1 +// CHECK-NEXT: store i32 %0, i32* %a, align 4 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_u64 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <1 x i64> %b, i32 0 +// CHECK-NEXT: store i64 %0, i64* %a, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <8 x i8> %b, i32 7 +// CHECK-NEXT: store i8 %0, i8* %a, align 1 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <4 x i16> %b, i32 3 +// CHECK-NEXT: store i16 %0, i16* %a, align 2 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <2 x i32> %b, i32 1 +// CHECK-NEXT: store i32 %0, i32* %a, align 4 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_s64 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <1 x i64> %b, i32 0 +// CHECK-NEXT: store i64 %0, i64* %a, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <4 x half> %b, i32 3 +// CHECK-NEXT: store half %0, half* %a, align 2 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <2 x float> %b, i32 1 +// CHECK-NEXT: store float %0, float* %a, align 4 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <8 x i8> %b, i32 7 +// CHECK-NEXT: store i8 %0, i8* %a, align 1 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst1_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = extractelement <4 x i16> %b, i32 3 +// CHECK-NEXT: store i16 %0, i16* %a, align 2 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_u8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = 
extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <16 x i8> %b.sroa.0.0.vec.expand, <16 x i8> %b.sroa.0.8.vec.expand, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <8 x i8> %2, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <8 x i8> %3, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <16 x i8> %b.sroa.3.16.vec.expand, <16 x i8> %b.sroa.3.24.vec.expand, <16 x i32> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> %b.sroa.0.8.vecblend, <16 x i8> %b.sroa.3.24.vecblend, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_u16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* %4, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_u32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: 
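+//
+// The long shufflevector chains above come from the uint8x16x2_t (etc.)
+// argument being coerced to a [4 x i64] array at the ABI boundary; SROA then
+// rebuilds the two q registers before the @llvm.arm.neon.vst2 call. Sketch of
+// the C side (assumption):
+//
+//   void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) { vst2q_u8(a, b); }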
%b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %4 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* %4, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_s8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <16 x i8> %b.sroa.0.0.vec.expand, <16 x i8> %b.sroa.0.8.vec.expand, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <8 x i8> %2, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <8 x i8> %3, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <16 x i8> %b.sroa.3.16.vec.expand, <16 x i8> %b.sroa.3.24.vec.expand, <16 x i32> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> %b.sroa.0.8.vecblend, <16 x i8> %b.sroa.3.24.vecblend, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_s16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, 
<8 x i32> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* %4, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %4 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* %4, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_f16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x half> %0, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x half> %1, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x half> %b.sroa.0.0.vec.expand, <8 x half> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x half> %2, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x half> %3, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x half> %b.sroa.3.16.vec.expand, <8 x half> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %4 = bitcast half* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v8f16(i8* %4, <8 x half> %b.sroa.0.8.vecblend, <8 x half> %b.sroa.3.24.vecblend, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_f32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> +// 
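+//
+// Note that the trailing i32 operand on each @llvm.arm.neon.vst2 call above
+// is the pointee alignment in bytes (1 for 8-bit, 2 for 16-bit/f16, 4 for
+// 32-bit elements), not a lane index.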
CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x float> %b.sroa.0.0.vec.expand, <4 x float> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x float> %b.sroa.3.16.vec.expand, <4 x float> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %4 = bitcast float* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %4, <4 x float> %b.sroa.0.8.vecblend, <4 x float> %b.sroa.3.24.vecblend, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_p8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <16 x i8> %b.sroa.0.0.vec.expand, <16 x i8> %b.sroa.0.8.vec.expand, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <8 x i8> %2, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <8 x i8> %3, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <16 x i8> %b.sroa.3.16.vec.expand, <16 x i8> %b.sroa.3.24.vec.expand, <16 x i32> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> %b.sroa.0.8.vecblend, <16 x i8> %b.sroa.3.24.vecblend, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_p16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = 
shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* %4, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_u8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_u16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* %2, <4 x i16> %0, <4 x i16> %1, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_u32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %2 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* %2, <2 x i32> %0, <2 x i32> %1, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_u64 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = insertelement <1 x i64> undef, i64 %b.coerce.fca.0.extract, i32 0 +// CHECK-NEXT: %1 = insertelement <1 x i64> undef, i64 %b.coerce.fca.1.extract, i32 0 +// CHECK-NEXT: %2 = bitcast i64* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* %2, <1 x i64> %0, <1 x i64> %1, i32 8) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_s8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_s16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i16* 
%a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* %2, <4 x i16> %0, <4 x i16> %1, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %2 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* %2, <2 x i32> %0, <2 x i32> %1, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_s64 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = insertelement <1 x i64> undef, i64 %b.coerce.fca.0.extract, i32 0 +// CHECK-NEXT: %1 = insertelement <1 x i64> undef, i64 %b.coerce.fca.1.extract, i32 0 +// CHECK-NEXT: %2 = bitcast i64* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* %2, <1 x i64> %0, <1 x i64> %1, i32 8) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_f16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half> +// CHECK-NEXT: %2 = bitcast half* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v4f16(i8* %2, <4 x half> %0, <4 x half> %1, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_f32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float> +// CHECK-NEXT: %2 = bitcast float* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v2f32(i8* %2, <2 x float> %0, <2 x float> %1, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_p8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_p16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* %2, <4 x i16> %0, <4 x i16> %1, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> 
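+//
+// For the d-register vst2 tests the pair is coerced to [2 x i64] and simply
+// bitcast back, with no shuffling. Sketch of the C side (assumption):
+//
+//   void test_vst2_u8(uint8_t *a, uint8x8x2_t b)          { vst2_u8(a, b); }
+//   void test_vst2q_lane_u16(uint16_t *a, uint16x8x2_t b) { vst2q_lane_u16(a, b, 7); }
+//
+// The lane form lowers to @llvm.arm.neon.vst2lane, with the lane index (7)
+// and the alignment (2) as the trailing operands.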
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* %4, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, i32 7, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %4 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* %4, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, i32 3, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to 
<4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* %4, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, i32 7, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %4 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* %4, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, i32 3, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x half> %0, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x half> %1, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x half> %b.sroa.0.0.vec.expand, <8 x half> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x half> %2, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x half> %3, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x half> %b.sroa.3.16.vec.expand, <8 x half> %b.sroa.3.24.vec.expand, <8 x i32> +// 
CHECK-NEXT: %4 = bitcast half* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v8f16(i8* %4, <8 x half> %b.sroa.0.8.vecblend, <8 x half> %b.sroa.3.24.vecblend, i32 7, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x float> %b.sroa.0.0.vec.expand, <4 x float> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x float> %b.sroa.3.16.vec.expand, <4 x float> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %4 = bitcast float* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* %4, <4 x float> %b.sroa.0.8.vecblend, <4 x float> %b.sroa.3.24.vecblend, i32 3, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2q_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* %4, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, i32 7, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_lane_u8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 
%b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, i32 7, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %2, <4 x i16> %0, <4 x i16> %1, i32 3, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %2 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* %2, <2 x i32> %0, <2 x i32> %1, i32 1, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, i32 7, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %2, <4 x i16> %0, <4 x i16> %1, i32 3, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %2 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* %2, <2 x i32> %0, <2 x i32> %1, i32 1, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half> +// CHECK-NEXT: %2 = bitcast half* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v4f16(i8* %2, <4 x half> %0, <4 x half> %1, i32 3, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// 
CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float> +// CHECK-NEXT: %2 = bitcast float* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* %2, <2 x float> %0, <2 x float> %1, i32 1, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, i32 7, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst2_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %2, <4 x i16> %0, <4 x i16> %1, i32 3, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_u8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <16 x i8> %b.sroa.0.0.vec.expand, <16 x i8> %b.sroa.0.8.vec.expand, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <8 x i8> %2, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <8 x i8> %3, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <16 x i8> %b.sroa.3.16.vec.expand, <16 x i8> %b.sroa.3.24.vec.expand, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <8 x i8> %4, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <8 x i8> %5, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <16 x i8> %b.sroa.6.32.vec.expand, <16 x i8> %b.sroa.6.40.vec.expand, <16 x i32> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> %b.sroa.0.8.vecblend, <16 x i8> %b.sroa.3.24.vecblend, <16 x i8> %b.sroa.6.40.vecblend, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: 
test_vst3q_u16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %6 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* %6, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_u32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 
%b.coerce.fca.4.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32> +// CHECK-NEXT: %6 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* %6, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_s8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <16 x i8> %b.sroa.0.0.vec.expand, <16 x i8> %b.sroa.0.8.vec.expand, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <8 x i8> %2, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <8 x i8> %3, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <16 x i8> %b.sroa.3.16.vec.expand, <16 x i8> %b.sroa.3.24.vec.expand, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <8 x i8> %4, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <8 x i8> %5, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <16 x i8> %b.sroa.6.32.vec.expand, <16 x i8> %b.sroa.6.40.vec.expand, <16 x i32> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> %b.sroa.0.8.vecblend, <16 x i8> %b.sroa.3.24.vecblend, <16 x i8> %b.sroa.6.40.vecblend, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_s16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: 
%b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %6 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* %6, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32> +// CHECK-NEXT: %6 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* %6, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> 
%b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_f16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x half> %0, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x half> %1, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x half> %b.sroa.0.0.vec.expand, <8 x half> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x half> %2, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x half> %3, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x half> %b.sroa.3.16.vec.expand, <8 x half> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x half> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x half> %4, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x half> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x half> %5, <4 x half> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x half> %b.sroa.6.32.vec.expand, <8 x half> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %6 = bitcast half* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v8f16(i8* %6, <8 x half> %b.sroa.0.8.vecblend, <8 x half> %b.sroa.3.24.vecblend, <8 x half> %b.sroa.6.40.vecblend, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_f32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x float> %b.sroa.0.0.vec.expand, <4 x float> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x float> %b.sroa.3.16.vec.expand, 
<4 x float> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x float> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x float> %4, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x float> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x float> %b.sroa.6.32.vec.expand, <4 x float> %b.sroa.6.40.vec.expand, <4 x i32> +// CHECK-NEXT: %6 = bitcast float* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v4f32(i8* %6, <4 x float> %b.sroa.0.8.vecblend, <4 x float> %b.sroa.3.24.vecblend, <4 x float> %b.sroa.6.40.vecblend, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_p8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <16 x i8> %b.sroa.0.0.vec.expand, <16 x i8> %b.sroa.0.8.vec.expand, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <8 x i8> %2, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <8 x i8> %3, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <16 x i8> %b.sroa.3.16.vec.expand, <16 x i8> %b.sroa.3.24.vec.expand, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <8 x i8> %4, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <8 x i8> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <8 x i8> %5, <8 x i8> undef, <16 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <16 x i8> %b.sroa.6.32.vec.expand, <16 x i8> %b.sroa.6.40.vec.expand, <16 x i32> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> %b.sroa.0.8.vecblend, <16 x i8> %b.sroa.3.24.vecblend, <16 x i8> %b.sroa.6.40.vecblend, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_p16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> 
%1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %6 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* %6, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_u8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_u16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %3 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* %3, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_u32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %3 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void 
@llvm.arm.neon.vst3.p0i8.v2i32(i8* %3, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_u64 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = insertelement <1 x i64> undef, i64 %b.coerce.fca.0.extract, i32 0 +// CHECK-NEXT: %1 = insertelement <1 x i64> undef, i64 %b.coerce.fca.1.extract, i32 0 +// CHECK-NEXT: %2 = insertelement <1 x i64> undef, i64 %b.coerce.fca.2.extract, i32 0 +// CHECK-NEXT: %3 = bitcast i64* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %3, <1 x i64> %0, <1 x i64> %1, <1 x i64> %2, i32 8) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_s8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_s16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %3 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* %3, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %3 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* %3, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_s64 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = insertelement <1 x i64> undef, i64 %b.coerce.fca.0.extract, i32 0 +// CHECK-NEXT: %1 = insertelement <1 x i64> undef, i64 %b.coerce.fca.1.extract, i32 0 +// CHECK-NEXT: %2 = insertelement <1 x i64> undef, i64 %b.coerce.fca.2.extract, i32 0 +// CHECK-NEXT: %3 = bitcast i64* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %3, <1 x i64> %0, <1 x i64> %1, <1 x i64> %2, i32 8) +// 
CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_f16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half> +// CHECK-NEXT: %3 = bitcast half* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v4f16(i8* %3, <4 x half> %0, <4 x half> %1, <4 x half> %2, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_f32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float> +// CHECK-NEXT: %3 = bitcast float* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v2f32(i8* %3, <2 x float> %0, <2 x float> %1, <2 x float> %2, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_p8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3_p16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %3 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* %3, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_lane_u16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: 
%b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %6 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* %6, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, i32 7, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_lane_u32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32> +// CHECK-NEXT: %6 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* %6, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, i32 3, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_lane_s16 
+// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5 +// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16> +// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32> +// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32> +// CHECK-NEXT: %6 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* %6, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, i32 7, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst3q_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1 +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2 +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3 +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32> +// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32> +// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4 +// CHECK-NEXT: %4 = bitcast i64 
%b.coerce.fca.4.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32>
+// CHECK-NEXT: %6 = bitcast i32* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* %6, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, i32 3, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3q_lane_f16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x half> %0, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x half> %1, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x half> %b.sroa.0.0.vec.expand, <8 x half> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x half> %2, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x half> %3, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x half> %b.sroa.3.16.vec.expand, <8 x half> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x half> %4, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x half> %5, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x half> %b.sroa.6.32.vec.expand, <8 x half> %b.sroa.6.40.vec.expand, <8 x i32>
+// CHECK-NEXT: %6 = bitcast half* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v8f16(i8* %6, <8 x half> %b.sroa.0.8.vecblend, <8 x half> %b.sroa.3.24.vecblend, <8 x half> %b.sroa.6.40.vecblend, i32 7, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3q_lane_f32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x float> %b.sroa.0.0.vec.expand, <4 x float> %b.sroa.0.8.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x float> %b.sroa.3.16.vec.expand, <4 x float> %b.sroa.3.24.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x float> %4, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x float> %b.sroa.6.32.vec.expand, <4 x float> %b.sroa.6.40.vec.expand, <4 x i32>
+// CHECK-NEXT: %6 = bitcast float* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* %6, <4 x float> %b.sroa.0.8.vecblend, <4 x float> %b.sroa.3.24.vecblend, <4 x float> %b.sroa.6.40.vecblend, i32 3, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3q_lane_p16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [6 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [6 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [6 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [6 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [6 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [6 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32>
+// CHECK-NEXT: %6 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* %6, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, i32 7, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3_lane_u8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3_lane_u16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %3 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* %3, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, i32 3, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3_lane_u32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %3 = bitcast i32* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* %3, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3_lane_s8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3_lane_s16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %3 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* %3, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, i32 3, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3_lane_s32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %3 = bitcast i32* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* %3, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3_lane_f16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half>
+// CHECK-NEXT: %3 = bitcast half* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v4f16(i8* %3, <4 x half> %0, <4 x half> %1, <4 x half> %2, i32 3, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3_lane_f32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float>
+// CHECK-NEXT: %3 = bitcast float* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* %3, <2 x float> %0, <2 x float> %1, <2 x float> %2, i32 1, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3_lane_p8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst3_lane_p16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [3 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [3 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [3 x i64] %b.coerce, 2
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %3 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* %3, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, i32 3, i32 2)
+// CHECK-NEXT: ret void
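(For orientation: each CHECK-LABEL block in this file corresponds to a C wrapper in test/CodeGen/arm_neon_intrinsics.c that calls the matching intrinsic once; the wrappers themselves are not reproduced in this .v8 expectations file. A minimal sketch of what the vst3 lane-store tests presumably look like on the C side, with hypothetical parameter names:

// Hypothetical reconstruction of the C test the vst3_lane_p16 block above
// checks; the real definition lives in arm_neon_intrinsics.c, not here.
void test_vst3_lane_p16(poly16_t *a, poly16x4x3_t b) {
  // Stores lane 3 of each of the three d-register vectors; the expected IR
  // ends in @llvm.arm.neon.vst3lane.p0i8.v4i16(..., i32 3, i32 2), i.e.
  // lane index 3 followed by the alignment operand.
  vst3_lane_p16(a, b, 3);
}
)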
+// CHECK-LABEL: test_vst4q_u8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <16 x i8> %b.sroa.0.0.vec.expand, <16 x i8> %b.sroa.0.8.vec.expand, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <8 x i8> %2, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <8 x i8> %3, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <16 x i8> %b.sroa.3.16.vec.expand, <16 x i8> %b.sroa.3.24.vec.expand, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <8 x i8> %4, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <8 x i8> %5, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <16 x i8> %b.sroa.6.32.vec.expand, <16 x i8> %b.sroa.6.40.vec.expand, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <8 x i8> %6, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <8 x i8> %7, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <16 x i8> %b.sroa.9.48.vec.expand, <16 x i8> %b.sroa.9.56.vec.expand, <16 x i32>
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> %b.sroa.0.8.vecblend, <16 x i8> %b.sroa.3.24.vecblend, <16 x i8> %b.sroa.6.40.vecblend, <16 x i8> %b.sroa.9.56.vecblend, i32 1)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_u16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x i16> %6, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x i16> %7, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x i16> %b.sroa.9.48.vec.expand, <8 x i16> %b.sroa.9.56.vec.expand, <8 x i32>
+// CHECK-NEXT: %8 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* %8, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, <8 x i16> %b.sroa.9.56.vecblend, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_u32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <2 x i32> %6, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <2 x i32> %7, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <4 x i32> %b.sroa.9.48.vec.expand, <4 x i32> %b.sroa.9.56.vec.expand, <4 x i32>
+// CHECK-NEXT: %8 = bitcast i32* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* %8, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, <4 x i32> %b.sroa.9.56.vecblend, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_s8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <16 x i8> %b.sroa.0.0.vec.expand, <16 x i8> %b.sroa.0.8.vec.expand, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <8 x i8> %2, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <8 x i8> %3, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <16 x i8> %b.sroa.3.16.vec.expand, <16 x i8> %b.sroa.3.24.vec.expand, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <8 x i8> %4, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <8 x i8> %5, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <16 x i8> %b.sroa.6.32.vec.expand, <16 x i8> %b.sroa.6.40.vec.expand, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <8 x i8> %6, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <8 x i8> %7, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <16 x i8> %b.sroa.9.48.vec.expand, <16 x i8> %b.sroa.9.56.vec.expand, <16 x i32>
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> %b.sroa.0.8.vecblend, <16 x i8> %b.sroa.3.24.vecblend, <16 x i8> %b.sroa.6.40.vecblend, <16 x i8> %b.sroa.9.56.vecblend, i32 1)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_s16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x i16> %6, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x i16> %7, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x i16> %b.sroa.9.48.vec.expand, <8 x i16> %b.sroa.9.56.vec.expand, <8 x i32>
+// CHECK-NEXT: %8 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* %8, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, <8 x i16> %b.sroa.9.56.vecblend, i32 2)
+// CHECK-NEXT: ret void
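(The expand/blend shuffle pairs in the q-register tests above come from SROA reassembling each <16 x i8>/<8 x i16>/<4 x i32> argument out of two i64 halves of the [8 x i64] array the 64-byte struct is coerced to. A minimal sketch of the C side these checks presumably mirror, with assumed parameter names:

// Hypothetical mirror of the C test: int16x8x4_t is passed coerced to
// [8 x i64]; each <8 x i16> q vector is rebuilt from two i64 halves, which
// is exactly what the *.vec.expand / *.vecblend shufflevectors check for.
void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
  vst4q_s16(a, b); // interleaved store of four <8 x i16> vectors
}
)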
+// CHECK-LABEL: test_vst4q_s32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <2 x i32> %6, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <2 x i32> %7, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <4 x i32> %b.sroa.9.48.vec.expand, <4 x i32> %b.sroa.9.56.vec.expand, <4 x i32>
+// CHECK-NEXT: %8 = bitcast i32* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* %8, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, <4 x i32> %b.sroa.9.56.vecblend, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_f16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x half> %0, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x half> %1, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x half> %b.sroa.0.0.vec.expand, <8 x half> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x half> %2, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x half> %3, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x half> %b.sroa.3.16.vec.expand, <8 x half> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x half> %4, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x half> %5, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x half> %b.sroa.6.32.vec.expand, <8 x half> %b.sroa.6.40.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x half> %6, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x half> %7, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x half> %b.sroa.9.48.vec.expand, <8 x half> %b.sroa.9.56.vec.expand, <8 x i32>
+// CHECK-NEXT: %8 = bitcast half* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v8f16(i8* %8, <8 x half> %b.sroa.0.8.vecblend, <8 x half> %b.sroa.3.24.vecblend, <8 x half> %b.sroa.6.40.vecblend, <8 x half> %b.sroa.9.56.vecblend, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_f32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x float> %b.sroa.0.0.vec.expand, <4 x float> %b.sroa.0.8.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x float> %b.sroa.3.16.vec.expand, <4 x float> %b.sroa.3.24.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x float> %4, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x float> %b.sroa.6.32.vec.expand, <4 x float> %b.sroa.6.40.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <2 x float> %6, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <2 x float> %7, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <4 x float> %b.sroa.9.48.vec.expand, <4 x float> %b.sroa.9.56.vec.expand, <4 x i32>
+// CHECK-NEXT: %8 = bitcast float* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* %8, <4 x float> %b.sroa.0.8.vecblend, <4 x float> %b.sroa.3.24.vecblend, <4 x float> %b.sroa.6.40.vecblend, <4 x float> %b.sroa.9.56.vecblend, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_p8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <16 x i8> %b.sroa.0.0.vec.expand, <16 x i8> %b.sroa.0.8.vec.expand, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <8 x i8> %2, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <8 x i8> %3, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <16 x i8> %b.sroa.3.16.vec.expand, <16 x i8> %b.sroa.3.24.vec.expand, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <8 x i8> %4, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <8 x i8> %5, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <16 x i8> %b.sroa.6.32.vec.expand, <16 x i8> %b.sroa.6.40.vec.expand, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <8 x i8> %6, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <8 x i8>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <8 x i8> %7, <8 x i8> undef, <16 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <16 x i8> %b.sroa.9.48.vec.expand, <16 x i8> %b.sroa.9.56.vec.expand, <16 x i32>
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> %b.sroa.0.8.vecblend, <16 x i8> %b.sroa.3.24.vecblend, <16 x i8> %b.sroa.6.40.vecblend, <16 x i8> %b.sroa.9.56.vecblend, i32 1)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_p16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x i16> %6, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x i16> %7, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x i16> %b.sroa.9.48.vec.expand, <8 x i16> %b.sroa.9.56.vec.expand, <8 x i32>
+// CHECK-NEXT: %8 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* %8, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, <8 x i16> %b.sroa.9.56.vecblend, i32 2)
+// CHECK-NEXT: ret void
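(The 64-bit d-register vst4 variants that follow are noticeably simpler: each struct element is exactly one i64, so a single bitcast per vector replaces the expand/blend shuffles. Presumed C shape, with assumed parameter names:

// Hypothetical mirror of the C test: uint8x8x4_t is coerced to [4 x i64],
// so each <8 x i8> comes from one bitcast and no shufflevectors are needed.
void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
  vst4_u8(a, b); // expected to lower to @llvm.arm.neon.vst4.p0i8.v8i8
}
)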
+// CHECK-LABEL: test_vst4_u8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8>
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 1)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_u16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %4 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* %4, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_u32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32>
+// CHECK-NEXT: %4 = bitcast i32* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* %4, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_u64
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = insertelement <1 x i64> undef, i64 %b.coerce.fca.0.extract, i32 0
+// CHECK-NEXT: %1 = insertelement <1 x i64> undef, i64 %b.coerce.fca.1.extract, i32 0
+// CHECK-NEXT: %2 = insertelement <1 x i64> undef, i64 %b.coerce.fca.2.extract, i32 0
+// CHECK-NEXT: %3 = insertelement <1 x i64> undef, i64 %b.coerce.fca.3.extract, i32 0
+// CHECK-NEXT: %4 = bitcast i64* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %4, <1 x i64> %0, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_s8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8>
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 1)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_s16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %4 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* %4, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_s32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32>
+// CHECK-NEXT: %4 = bitcast i32* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* %4, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_s64
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = insertelement <1 x i64> undef, i64 %b.coerce.fca.0.extract, i32 0
+// CHECK-NEXT: %1 = insertelement <1 x i64> undef, i64 %b.coerce.fca.1.extract, i32 0
+// CHECK-NEXT: %2 = insertelement <1 x i64> undef, i64 %b.coerce.fca.2.extract, i32 0
+// CHECK-NEXT: %3 = insertelement <1 x i64> undef, i64 %b.coerce.fca.3.extract, i32 0
+// CHECK-NEXT: %4 = bitcast i64* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %4, <1 x i64> %0, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_f16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half>
+// CHECK-NEXT: %4 = bitcast half* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v4f16(i8* %4, <4 x half> %0, <4 x half> %1, <4 x half> %2, <4 x half> %3, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_f32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float>
+// CHECK-NEXT: %4 = bitcast float* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v2f32(i8* %4, <2 x float> %0, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_p8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8>
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 1)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_p16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %4 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* %4, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_lane_u16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x i16> %6, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x i16> %7, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x i16> %b.sroa.9.48.vec.expand, <8 x i16> %b.sroa.9.56.vec.expand, <8 x i32>
+// CHECK-NEXT: %8 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* %8, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, <8 x i16> %b.sroa.9.56.vecblend, i32 7, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_lane_u32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <2 x i32> %6, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <2 x i32> %7, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <4 x i32> %b.sroa.9.48.vec.expand, <4 x i32> %b.sroa.9.56.vec.expand, <4 x i32>
+// CHECK-NEXT: %8 = bitcast i32* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* %8, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, <4 x i32> %b.sroa.9.56.vecblend, i32 3, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_lane_s16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x i16> %6, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x i16> %7, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x i16> %b.sroa.9.48.vec.expand, <8 x i16> %b.sroa.9.56.vec.expand, <8 x i32>
+// CHECK-NEXT: %8 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* %8, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, <8 x i16> %b.sroa.9.56.vecblend, i32 7, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_lane_s32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x i32> %b.sroa.0.0.vec.expand, <4 x i32> %b.sroa.0.8.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x i32> %3, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x i32> %b.sroa.3.16.vec.expand, <4 x i32> %b.sroa.3.24.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x i32> %b.sroa.6.32.vec.expand, <4 x i32> %b.sroa.6.40.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <2 x i32> %6, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <2 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <2 x i32> %7, <2 x i32> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <4 x i32> %b.sroa.9.48.vec.expand, <4 x i32> %b.sroa.9.56.vec.expand, <4 x i32>
+// CHECK-NEXT: %8 = bitcast i32* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* %8, <4 x i32> %b.sroa.0.8.vecblend, <4 x i32> %b.sroa.3.24.vecblend, <4 x i32> %b.sroa.6.40.vecblend, <4 x i32> %b.sroa.9.56.vecblend, i32 3, i32 4)
+// CHECK-NEXT: ret void
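(The f16 variants below are the ones this patch actually enables: the ARM.cpp hunk makes +fp16fml set HasLegalHalfType, so half is a legal type and the checks can expect native <4 x half>/<8 x half> IR. Presumed C shape, with assumed parameter names:

// Hypothetical mirror of the C test: under +fp16fml the float16x8x4_t struct
// lowers to <8 x half> vectors and an @llvm.arm.neon.vst4lane.p0i8.v8f16 call.
void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) {
  vst4q_lane_f16(a, b, 7);
}
)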
+// CHECK-LABEL: test_vst4q_lane_f16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x half> %0, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x half> %1, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x half> %b.sroa.0.0.vec.expand, <8 x half> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x half> %2, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x half> %3, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x half> %b.sroa.3.16.vec.expand, <8 x half> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x half> %4, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x half> %5, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x half> %b.sroa.6.32.vec.expand, <8 x half> %b.sroa.6.40.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x half> %6, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x half>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x half> %7, <4 x half> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x half> %b.sroa.9.48.vec.expand, <8 x half> %b.sroa.9.56.vec.expand, <8 x i32>
+// CHECK-NEXT: %8 = bitcast half* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v8f16(i8* %8, <8 x half> %b.sroa.0.8.vecblend, <8 x half> %b.sroa.3.24.vecblend, <8 x half> %b.sroa.6.40.vecblend, <8 x half> %b.sroa.9.56.vecblend, i32 7, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_lane_f32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <4 x float> %b.sroa.0.0.vec.expand, <4 x float> %b.sroa.0.8.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <4 x float> %b.sroa.3.16.vec.expand, <4 x float> %b.sroa.3.24.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <2 x float> %4, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <4 x float> %b.sroa.6.32.vec.expand, <4 x float> %b.sroa.6.40.vec.expand, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <2 x float> %6, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <2 x float>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <2 x float> %7, <2 x float> undef, <4 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <4 x float> %b.sroa.9.48.vec.expand, <4 x float> %b.sroa.9.56.vec.expand, <4 x i32>
+// CHECK-NEXT: %8 = bitcast float* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* %8, <4 x float> %b.sroa.0.8.vecblend, <4 x float> %b.sroa.3.24.vecblend, <4 x float> %b.sroa.6.40.vecblend, <4 x float> %b.sroa.9.56.vecblend, i32 3, i32 4)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4q_lane_p16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [8 x i64] %b.coerce, 0
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.0.vec.expand = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [8 x i64] %b.coerce, 1
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.0.8.vec.expand = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.0.8.vecblend = shufflevector <8 x i16> %b.sroa.0.0.vec.expand, <8 x i16> %b.sroa.0.8.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [8 x i64] %b.coerce, 2
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.16.vec.expand = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [8 x i64] %b.coerce, 3
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.3.24.vec.expand = shufflevector <4 x i16> %3, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.3.24.vecblend = shufflevector <8 x i16> %b.sroa.3.16.vec.expand, <8 x i16> %b.sroa.3.24.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.4.extract = extractvalue [8 x i64] %b.coerce, 4
+// CHECK-NEXT: %4 = bitcast i64 %b.coerce.fca.4.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.32.vec.expand = shufflevector <4 x i16> %4, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.5.extract = extractvalue [8 x i64] %b.coerce, 5
+// CHECK-NEXT: %5 = bitcast i64 %b.coerce.fca.5.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.6.40.vec.expand = shufflevector <4 x i16> %5, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.6.40.vecblend = shufflevector <8 x i16> %b.sroa.6.32.vec.expand, <8 x i16> %b.sroa.6.40.vec.expand, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.6.extract = extractvalue [8 x i64] %b.coerce, 6
+// CHECK-NEXT: %6 = bitcast i64 %b.coerce.fca.6.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.48.vec.expand = shufflevector <4 x i16> %6, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.coerce.fca.7.extract = extractvalue [8 x i64] %b.coerce, 7
+// CHECK-NEXT: %7 = bitcast i64 %b.coerce.fca.7.extract to <4 x i16>
+// CHECK-NEXT: %b.sroa.9.56.vec.expand = shufflevector <4 x i16> %7, <4 x i16> undef, <8 x i32>
+// CHECK-NEXT: %b.sroa.9.56.vecblend = shufflevector <8 x i16> %b.sroa.9.48.vec.expand, <8 x i16> %b.sroa.9.56.vec.expand, <8 x i32>
+// CHECK-NEXT: %8 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* %8, <8 x i16> %b.sroa.0.8.vecblend, <8 x i16> %b.sroa.3.24.vecblend, <8 x i16> %b.sroa.6.40.vecblend, <8 x i16> %b.sroa.9.56.vecblend, i32 7, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_lane_u8
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8>
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 7, i32 1)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_lane_u16
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16>
+// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16>
+// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16>
+// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16>
+// CHECK-NEXT: %4 = bitcast i16* %a to i8*
+// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* %4, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vst4_lane_u32
+// CHECK: entry:
+// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0
+// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1
+// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2
+// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3
+// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x
i32> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %4 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* %4, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 1, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst4_lane_s8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 7, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst4_lane_s16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* %4, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst4_lane_s32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x i32> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x i32> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x i32> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x i32> +// CHECK-NEXT: %4 = bitcast i32* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* %4, <2 x i32> %0, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 1, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst4_lane_f16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x half> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x half> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x half> +// CHECK-NEXT: %3 = 
bitcast i64 %b.coerce.fca.3.extract to <4 x half> +// CHECK-NEXT: %4 = bitcast half* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v4f16(i8* %4, <4 x half> %0, <4 x half> %1, <4 x half> %2, <4 x half> %3, i32 3, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst4_lane_f32 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <2 x float> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <2 x float> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <2 x float> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <2 x float> +// CHECK-NEXT: %4 = bitcast float* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* %4, <2 x float> %0, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst4_lane_p8 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <8 x i8> +// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 7, i32 1) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vst4_lane_p16 +// CHECK: entry: +// CHECK-NEXT: %b.coerce.fca.0.extract = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %b.coerce.fca.1.extract = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %b.coerce.fca.2.extract = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %b.coerce.fca.3.extract = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %b.coerce.fca.0.extract to <4 x i16> +// CHECK-NEXT: %1 = bitcast i64 %b.coerce.fca.1.extract to <4 x i16> +// CHECK-NEXT: %2 = bitcast i64 %b.coerce.fca.2.extract to <4 x i16> +// CHECK-NEXT: %3 = bitcast i64 %b.coerce.fca.3.extract to <4 x i16> +// CHECK-NEXT: %4 = bitcast i16* %a to i8* +// CHECK-NEXT: tail call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* %4, <4 x i16> %0, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vsub_s8 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <8 x i8> %a, %b +// CHECK-NEXT: ret <8 x i8> %sub.i +// CHECK-LABEL: test_vsub_s16 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <4 x i16> %a, %b +// CHECK-NEXT: ret <4 x i16> %sub.i +// CHECK-LABEL: test_vsub_s32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <2 x i32> %a, %b +// CHECK-NEXT: ret <2 x i32> %sub.i +// CHECK-LABEL: test_vsub_s64 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <1 x i64> %a, %b +// CHECK-NEXT: ret <1 x i64> %sub.i +// CHECK-LABEL: test_vsub_f32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = fsub <2 x float> %a, %b +// CHECK-NEXT: ret <2 x float> %sub.i +// CHECK-LABEL: test_vsub_u8 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <8 x 
i8> %a, %b +// CHECK-NEXT: ret <8 x i8> %sub.i +// CHECK-LABEL: test_vsub_u16 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <4 x i16> %a, %b +// CHECK-NEXT: ret <4 x i16> %sub.i +// CHECK-LABEL: test_vsub_u32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <2 x i32> %a, %b +// CHECK-NEXT: ret <2 x i32> %sub.i +// CHECK-LABEL: test_vsub_u64 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <1 x i64> %a, %b +// CHECK-NEXT: ret <1 x i64> %sub.i +// CHECK-LABEL: test_vsubq_s8 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <16 x i8> %a, %b +// CHECK-NEXT: ret <16 x i8> %sub.i +// CHECK-LABEL: test_vsubq_s16 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <8 x i16> %a, %b +// CHECK-NEXT: ret <8 x i16> %sub.i +// CHECK-LABEL: test_vsubq_s32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %b +// CHECK-NEXT: ret <4 x i32> %sub.i +// CHECK-LABEL: test_vsubq_s64 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <2 x i64> %a, %b +// CHECK-NEXT: ret <2 x i64> %sub.i +// CHECK-LABEL: test_vsubq_f32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = fsub <4 x float> %a, %b +// CHECK-NEXT: ret <4 x float> %sub.i +// CHECK-LABEL: test_vsubq_u8 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <16 x i8> %a, %b +// CHECK-NEXT: ret <16 x i8> %sub.i +// CHECK-LABEL: test_vsubq_u16 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <8 x i16> %a, %b +// CHECK-NEXT: ret <8 x i16> %sub.i +// CHECK-LABEL: test_vsubq_u32 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %b +// CHECK-NEXT: ret <4 x i32> %sub.i +// CHECK-LABEL: test_vsubq_u64 +// CHECK: entry: +// CHECK-NEXT: %sub.i = sub <2 x i64> %a, %b +// CHECK-NEXT: ret <2 x i64> %sub.i +// CHECK-LABEL: test_vsubhn_s16 +// CHECK: entry: +// CHECK-NEXT: %vsubhn.i = sub <8 x i16> %a, %b +// CHECK-NEXT: %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK-NEXT: %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %vsubhn2.i +// CHECK-LABEL: test_vsubhn_s32 +// CHECK: entry: +// CHECK-NEXT: %vsubhn.i = sub <4 x i32> %a, %b +// CHECK-NEXT: %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16> +// CHECK-NEXT: %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vsubhn2.i +// CHECK-LABEL: test_vsubhn_s64 +// CHECK: entry: +// CHECK-NEXT: %vsubhn.i = sub <2 x i64> %a, %b +// CHECK-NEXT: %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32> +// CHECK-NEXT: %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vsubhn2.i +// CHECK-LABEL: test_vsubhn_u16 +// CHECK: entry: +// CHECK-NEXT: %vsubhn.i = sub <8 x i16> %a, %b +// CHECK-NEXT: %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK-NEXT: %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %vsubhn2.i +// CHECK-LABEL: test_vsubhn_u32 +// CHECK: entry: +// CHECK-NEXT: %vsubhn.i = sub <4 x i32> %a, %b +// CHECK-NEXT: %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16> +// CHECK-NEXT: %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vsubhn2.i +// CHECK-LABEL: test_vsubhn_u64 +// CHECK: entry: +// CHECK-NEXT: %vsubhn.i = sub <2 x i64> %a, %b +// CHECK-NEXT: %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32> +// CHECK-NEXT: %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vsubhn2.i +// CHECK-LABEL: test_vsubl_s8 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = sext <8 x i8> %a to <8 x i16> +// CHECK-NEXT: %vmovl.i3.i = sext <8 x i8> %b to <8 x i16> +// CHECK-NEXT: %sub.i = sub nsw <8 x i16> %vmovl.i.i, %vmovl.i3.i +// CHECK-NEXT: ret <8 x i16> %sub.i +// CHECK-LABEL: test_vsubl_s16 +// CHECK: entry: +//
CHECK-NEXT: %vmovl.i.i = sext <4 x i16> %a to <4 x i32> +// CHECK-NEXT: %vmovl.i3.i = sext <4 x i16> %b to <4 x i32> +// CHECK-NEXT: %sub.i = sub nsw <4 x i32> %vmovl.i.i, %vmovl.i3.i +// CHECK-NEXT: ret <4 x i32> %sub.i +// CHECK-LABEL: test_vsubl_s32 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = sext <2 x i32> %a to <2 x i64> +// CHECK-NEXT: %vmovl.i3.i = sext <2 x i32> %b to <2 x i64> +// CHECK-NEXT: %sub.i = sub nsw <2 x i64> %vmovl.i.i, %vmovl.i3.i +// CHECK-NEXT: ret <2 x i64> %sub.i +// CHECK-LABEL: test_vsubl_u8 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <8 x i8> %a to <8 x i16> +// CHECK-NEXT: %vmovl.i3.i = zext <8 x i8> %b to <8 x i16> +// CHECK-NEXT: %sub.i = sub nsw <8 x i16> %vmovl.i.i, %vmovl.i3.i +// CHECK-NEXT: ret <8 x i16> %sub.i +// CHECK-LABEL: test_vsubl_u16 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <4 x i16> %a to <4 x i32> +// CHECK-NEXT: %vmovl.i3.i = zext <4 x i16> %b to <4 x i32> +// CHECK-NEXT: %sub.i = sub nsw <4 x i32> %vmovl.i.i, %vmovl.i3.i +// CHECK-NEXT: ret <4 x i32> %sub.i +// CHECK-LABEL: test_vsubl_u32 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <2 x i32> %a to <2 x i64> +// CHECK-NEXT: %vmovl.i3.i = zext <2 x i32> %b to <2 x i64> +// CHECK-NEXT: %sub.i = sub nsw <2 x i64> %vmovl.i.i, %vmovl.i3.i +// CHECK-NEXT: ret <2 x i64> %sub.i +// CHECK-LABEL: test_vsubw_s8 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = sext <8 x i8> %b to <8 x i16> +// CHECK-NEXT: %sub.i = sub <8 x i16> %a, %vmovl.i.i +// CHECK-NEXT: ret <8 x i16> %sub.i +// CHECK-LABEL: test_vsubw_s16 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = sext <4 x i16> %b to <4 x i32> +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %vmovl.i.i +// CHECK-NEXT: ret <4 x i32> %sub.i +// CHECK-LABEL: test_vsubw_s32 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = sext <2 x i32> %b to <2 x i64> +// CHECK-NEXT: %sub.i = sub <2 x i64> %a, %vmovl.i.i +// CHECK-NEXT: ret <2 x i64> %sub.i +// CHECK-LABEL: test_vsubw_u8 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <8 x i8> %b to <8 x i16> +// CHECK-NEXT: %sub.i = sub <8 x i16> %a, %vmovl.i.i +// CHECK-NEXT: ret <8 x i16> %sub.i +// CHECK-LABEL: test_vsubw_u16 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <4 x i16> %b to <4 x i32> +// CHECK-NEXT: %sub.i = sub <4 x i32> %a, %vmovl.i.i +// CHECK-NEXT: ret <4 x i32> %sub.i +// CHECK-LABEL: test_vsubw_u32 +// CHECK: entry: +// CHECK-NEXT: %vmovl.i.i = zext <2 x i32> %b to <2 x i64> +// CHECK-NEXT: %sub.i = sub <2 x i64> %a, %vmovl.i.i +// CHECK-NEXT: ret <2 x i64> %sub.i +// CHECK-LABEL: test_vtbl1_u8 +// CHECK: entry: +// CHECK-NEXT: %vtbl1.i = tail call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl1.i +// CHECK-LABEL: test_vtbl1_s8 +// CHECK: entry: +// CHECK-NEXT: %vtbl1.i = tail call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl1.i +// CHECK-LABEL: test_vtbl1_p8 +// CHECK: entry: +// CHECK-NEXT: %vtbl1.i = tail call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl1.i +// CHECK-LABEL: test_vtbl2_u8 +// CHECK: entry: +// CHECK-NEXT: %__p0.coerce.fca.0.extract.i = extractvalue [2 x i64] %a.coerce, 0 +// CHECK-NEXT: %__p0.coerce.fca.1.extract.i = extractvalue [2 x i64] %a.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %__p0.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p0.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %vtbl2.i = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %0, <8 x i8> %1, <8 x i8> %b) +// CHECK-NEXT: ret <8 x 
i8> %vtbl2.i +// CHECK-LABEL: test_vtbl2_s8 +// CHECK: entry: +// CHECK-NEXT: %__p0.coerce.fca.0.extract.i = extractvalue [2 x i64] %a.coerce, 0 +// CHECK-NEXT: %__p0.coerce.fca.1.extract.i = extractvalue [2 x i64] %a.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %__p0.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p0.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %vtbl2.i = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %0, <8 x i8> %1, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl2.i +// CHECK-LABEL: test_vtbl2_p8 +// CHECK: entry: +// CHECK-NEXT: %__p0.coerce.fca.0.extract.i = extractvalue [2 x i64] %a.coerce, 0 +// CHECK-NEXT: %__p0.coerce.fca.1.extract.i = extractvalue [2 x i64] %a.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %__p0.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p0.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %vtbl2.i = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %0, <8 x i8> %1, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl2.i +// CHECK-LABEL: test_vtbl3_u8 +// CHECK: entry: +// CHECK-NEXT: %__p0.coerce.fca.0.extract.i = extractvalue [3 x i64] %a.coerce, 0 +// CHECK-NEXT: %__p0.coerce.fca.1.extract.i = extractvalue [3 x i64] %a.coerce, 1 +// CHECK-NEXT: %__p0.coerce.fca.2.extract.i = extractvalue [3 x i64] %a.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %__p0.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p0.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p0.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %vtbl3.i = tail call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl3.i +// CHECK-LABEL: test_vtbl3_s8 +// CHECK: entry: +// CHECK-NEXT: %__p0.coerce.fca.0.extract.i = extractvalue [3 x i64] %a.coerce, 0 +// CHECK-NEXT: %__p0.coerce.fca.1.extract.i = extractvalue [3 x i64] %a.coerce, 1 +// CHECK-NEXT: %__p0.coerce.fca.2.extract.i = extractvalue [3 x i64] %a.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %__p0.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p0.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p0.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %vtbl3.i = tail call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl3.i +// CHECK-LABEL: test_vtbl3_p8 +// CHECK: entry: +// CHECK-NEXT: %__p0.coerce.fca.0.extract.i = extractvalue [3 x i64] %a.coerce, 0 +// CHECK-NEXT: %__p0.coerce.fca.1.extract.i = extractvalue [3 x i64] %a.coerce, 1 +// CHECK-NEXT: %__p0.coerce.fca.2.extract.i = extractvalue [3 x i64] %a.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %__p0.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p0.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p0.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %vtbl3.i = tail call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl3.i +// CHECK-LABEL: test_vtbl4_u8 +// CHECK: entry: +// CHECK-NEXT: %__p0.coerce.fca.0.extract.i = extractvalue [4 x i64] %a.coerce, 0 +// CHECK-NEXT: %__p0.coerce.fca.1.extract.i = extractvalue [4 x i64] %a.coerce, 1 +// CHECK-NEXT: %__p0.coerce.fca.2.extract.i = extractvalue [4 x i64] %a.coerce, 2 +// CHECK-NEXT: %__p0.coerce.fca.3.extract.i = extractvalue [4 x i64] %a.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %__p0.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 
%__p0.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p0.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %__p0.coerce.fca.3.extract.i to <8 x i8> +// CHECK-NEXT: %vtbl4.i = tail call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl4.i +// CHECK-LABEL: test_vtbl4_s8 +// CHECK: entry: +// CHECK-NEXT: %__p0.coerce.fca.0.extract.i = extractvalue [4 x i64] %a.coerce, 0 +// CHECK-NEXT: %__p0.coerce.fca.1.extract.i = extractvalue [4 x i64] %a.coerce, 1 +// CHECK-NEXT: %__p0.coerce.fca.2.extract.i = extractvalue [4 x i64] %a.coerce, 2 +// CHECK-NEXT: %__p0.coerce.fca.3.extract.i = extractvalue [4 x i64] %a.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %__p0.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p0.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p0.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %__p0.coerce.fca.3.extract.i to <8 x i8> +// CHECK-NEXT: %vtbl4.i = tail call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl4.i +// CHECK-LABEL: test_vtbl4_p8 +// CHECK: entry: +// CHECK-NEXT: %__p0.coerce.fca.0.extract.i = extractvalue [4 x i64] %a.coerce, 0 +// CHECK-NEXT: %__p0.coerce.fca.1.extract.i = extractvalue [4 x i64] %a.coerce, 1 +// CHECK-NEXT: %__p0.coerce.fca.2.extract.i = extractvalue [4 x i64] %a.coerce, 2 +// CHECK-NEXT: %__p0.coerce.fca.3.extract.i = extractvalue [4 x i64] %a.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %__p0.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p0.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p0.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %__p0.coerce.fca.3.extract.i to <8 x i8> +// CHECK-NEXT: %vtbl4.i = tail call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %b) +// CHECK-NEXT: ret <8 x i8> %vtbl4.i +// CHECK-LABEL: test_vtbx1_u8 +// CHECK: entry: +// CHECK-NEXT: %vtbx1.i = tail call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx1.i +// CHECK-LABEL: test_vtbx1_s8 +// CHECK: entry: +// CHECK-NEXT: %vtbx1.i = tail call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx1.i +// CHECK-LABEL: test_vtbx1_p8 +// CHECK: entry: +// CHECK-NEXT: %vtbx1.i = tail call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx1.i +// CHECK-LABEL: test_vtbx2_u8 +// CHECK: entry: +// CHECK-NEXT: %__p1.coerce.fca.0.extract.i = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %__p1.coerce.fca.1.extract.i = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %__p1.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p1.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %vtbx2.i = tail call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx2.i +// CHECK-LABEL: test_vtbx2_s8 +// CHECK: entry: +// CHECK-NEXT: %__p1.coerce.fca.0.extract.i = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %__p1.coerce.fca.1.extract.i = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %__p1.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p1.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %vtbx2.i = tail call <8 
x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx2.i +// CHECK-LABEL: test_vtbx2_p8 +// CHECK: entry: +// CHECK-NEXT: %__p1.coerce.fca.0.extract.i = extractvalue [2 x i64] %b.coerce, 0 +// CHECK-NEXT: %__p1.coerce.fca.1.extract.i = extractvalue [2 x i64] %b.coerce, 1 +// CHECK-NEXT: %0 = bitcast i64 %__p1.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p1.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %vtbx2.i = tail call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx2.i +// CHECK-LABEL: test_vtbx3_u8 +// CHECK: entry: +// CHECK-NEXT: %__p1.coerce.fca.0.extract.i = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %__p1.coerce.fca.1.extract.i = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %__p1.coerce.fca.2.extract.i = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %__p1.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p1.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p1.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %vtbx3.i = tail call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx3.i +// CHECK-LABEL: test_vtbx3_s8 +// CHECK: entry: +// CHECK-NEXT: %__p1.coerce.fca.0.extract.i = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %__p1.coerce.fca.1.extract.i = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %__p1.coerce.fca.2.extract.i = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %__p1.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p1.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p1.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %vtbx3.i = tail call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx3.i +// CHECK-LABEL: test_vtbx3_p8 +// CHECK: entry: +// CHECK-NEXT: %__p1.coerce.fca.0.extract.i = extractvalue [3 x i64] %b.coerce, 0 +// CHECK-NEXT: %__p1.coerce.fca.1.extract.i = extractvalue [3 x i64] %b.coerce, 1 +// CHECK-NEXT: %__p1.coerce.fca.2.extract.i = extractvalue [3 x i64] %b.coerce, 2 +// CHECK-NEXT: %0 = bitcast i64 %__p1.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p1.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p1.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %vtbx3.i = tail call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx3.i +// CHECK-LABEL: test_vtbx4_u8 +// CHECK: entry: +// CHECK-NEXT: %__p1.coerce.fca.0.extract.i = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %__p1.coerce.fca.1.extract.i = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %__p1.coerce.fca.2.extract.i = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %__p1.coerce.fca.3.extract.i = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %__p1.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p1.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p1.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %__p1.coerce.fca.3.extract.i to <8 x i8> +// CHECK-NEXT: %vtbx4.i = tail call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %c) +// CHECK-NEXT: ret <8 x 
i8> %vtbx4.i +// CHECK-LABEL: test_vtbx4_s8 +// CHECK: entry: +// CHECK-NEXT: %__p1.coerce.fca.0.extract.i = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %__p1.coerce.fca.1.extract.i = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %__p1.coerce.fca.2.extract.i = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %__p1.coerce.fca.3.extract.i = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %__p1.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p1.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p1.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %__p1.coerce.fca.3.extract.i to <8 x i8> +// CHECK-NEXT: %vtbx4.i = tail call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx4.i +// CHECK-LABEL: test_vtbx4_p8 +// CHECK: entry: +// CHECK-NEXT: %__p1.coerce.fca.0.extract.i = extractvalue [4 x i64] %b.coerce, 0 +// CHECK-NEXT: %__p1.coerce.fca.1.extract.i = extractvalue [4 x i64] %b.coerce, 1 +// CHECK-NEXT: %__p1.coerce.fca.2.extract.i = extractvalue [4 x i64] %b.coerce, 2 +// CHECK-NEXT: %__p1.coerce.fca.3.extract.i = extractvalue [4 x i64] %b.coerce, 3 +// CHECK-NEXT: %0 = bitcast i64 %__p1.coerce.fca.0.extract.i to <8 x i8> +// CHECK-NEXT: %1 = bitcast i64 %__p1.coerce.fca.1.extract.i to <8 x i8> +// CHECK-NEXT: %2 = bitcast i64 %__p1.coerce.fca.2.extract.i to <8 x i8> +// CHECK-NEXT: %3 = bitcast i64 %__p1.coerce.fca.3.extract.i to <8 x i8> +// CHECK-NEXT: %vtbx4.i = tail call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> %0, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %c) +// CHECK-NEXT: ret <8 x i8> %vtbx4.i +// CHECK-LABEL: test_vtrn_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vtrn.i, <8 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vtrn1.i, <8 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrn_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vtrn.i, <4 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vtrn1.i, <4 x i16>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrn_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK-NEXT: store <2 x i32> %vtrn.i, <2 x i32>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK-NEXT: store <2 x i32> %vtrn1.i, <2 x i32>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrn_u8 +// CHECK: 
entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vtrn.i, <8 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vtrn1.i, <8 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrn_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vtrn.i, <4 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vtrn1.i, <4 x i16>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrn_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK-NEXT: store <2 x i32> %vtrn.i, <2 x i32>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK-NEXT: store <2 x i32> %vtrn1.i, <2 x i32>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrn_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK-NEXT: store <2 x float> %vtrn.i, <2 x float>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK-NEXT: store <2 x float> %vtrn1.i, <2 x float>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrn_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vtrn.i, <8 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vtrn1.i, <8 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrn_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vtrn.i, <4 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vtrn1.i, <4 x i16>* %1, align 8 +// 
CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrnq_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> +// CHECK-NEXT: store <16 x i8> %vtrn.i, <16 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> +// CHECK-NEXT: store <16 x i8> %vtrn1.i, <16 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrnq_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> +// CHECK-NEXT: store <8 x i16> %vtrn.i, <8 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> +// CHECK-NEXT: store <8 x i16> %vtrn1.i, <8 x i16>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrnq_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> +// CHECK-NEXT: store <4 x i32> %vtrn.i, <4 x i32>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> +// CHECK-NEXT: store <4 x i32> %vtrn1.i, <4 x i32>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrnq_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> +// CHECK-NEXT: store <16 x i8> %vtrn.i, <16 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> +// CHECK-NEXT: store <16 x i8> %vtrn1.i, <16 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrnq_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> +// CHECK-NEXT: store <8 x i16> %vtrn.i, <8 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> +// CHECK-NEXT: store <8 x i16> %vtrn1.i, <8 x i16>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrnq_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> +// CHECK-NEXT: store <4 x i32> %vtrn.i, <4 x i32>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> +// CHECK-NEXT:
store <4 x i32> %vtrn1.i, <4 x i32>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrnq_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK-NEXT: store <4 x float> %vtrn.i, <4 x float>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK-NEXT: store <4 x float> %vtrn1.i, <4 x float>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrnq_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK-NEXT: store <16 x i8> %vtrn.i, <16 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK-NEXT: store <16 x i8> %vtrn1.i, <16 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtrnq_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK-NEXT: store <8 x i16> %vtrn.i, <8 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK-NEXT: store <8 x i16> %vtrn1.i, <8 x i16>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vtst_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = and <8 x i8> %b, %a +// CHECK-NEXT: %1 = icmp ne <8 x i8> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <8 x i1> %1 to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %vtst.i +// CHECK-LABEL: test_vtst_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = and <4 x i16> %b, %a +// CHECK-NEXT: %1 = icmp ne <4 x i16> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <4 x i1> %1 to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vtst.i +// CHECK-LABEL: test_vtst_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = and <2 x i32> %b, %a +// CHECK-NEXT: %1 = icmp ne <2 x i32> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <2 x i1> %1 to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vtst.i +// CHECK-LABEL: test_vtst_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = and <8 x i8> %b, %a +// CHECK-NEXT: %1 = icmp ne <8 x i8> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <8 x i1> %1 to <8 x i8> +// CHECK-NEXT: ret <8 x i8> %vtst.i +// CHECK-LABEL: test_vtst_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = and <4 x i16> %b, %a +// CHECK-NEXT: %1 = icmp ne <4 x i16> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <4 x i1> %1 to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vtst.i +// CHECK-LABEL: test_vtst_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = and <2 x i32> %b, %a +// CHECK-NEXT: %1 = icmp ne <2 x i32> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <2 x i1> %1 to <2 x i32> +// CHECK-NEXT: ret <2 x i32> %vtst.i +// CHECK-LABEL: test_vtst_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = and <8 x i8> %b, %a +// CHECK-NEXT: %1 = icmp ne <8 x i8> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <8 x i1> %1 to <8 
x i8> +// CHECK-NEXT: ret <8 x i8> %vtst.i +// CHECK-LABEL: test_vtst_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = and <4 x i16> %b, %a +// CHECK-NEXT: %1 = icmp ne <4 x i16> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <4 x i1> %1 to <4 x i16> +// CHECK-NEXT: ret <4 x i16> %vtst.i +// CHECK-LABEL: test_vtstq_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = and <16 x i8> %b, %a +// CHECK-NEXT: %1 = icmp ne <16 x i8> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <16 x i1> %1 to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %vtst.i +// CHECK-LABEL: test_vtstq_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = and <8 x i16> %b, %a +// CHECK-NEXT: %1 = icmp ne <8 x i16> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <8 x i1> %1 to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %vtst.i +// CHECK-LABEL: test_vtstq_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = and <4 x i32> %b, %a +// CHECK-NEXT: %1 = icmp ne <4 x i32> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <4 x i1> %1 to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %vtst.i +// CHECK-LABEL: test_vtstq_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = and <16 x i8> %b, %a +// CHECK-NEXT: %1 = icmp ne <16 x i8> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <16 x i1> %1 to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %vtst.i +// CHECK-LABEL: test_vtstq_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = and <8 x i16> %b, %a +// CHECK-NEXT: %1 = icmp ne <8 x i16> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <8 x i1> %1 to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %vtst.i +// CHECK-LABEL: test_vtstq_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = and <4 x i32> %b, %a +// CHECK-NEXT: %1 = icmp ne <4 x i32> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <4 x i1> %1 to <4 x i32> +// CHECK-NEXT: ret <4 x i32> %vtst.i +// CHECK-LABEL: test_vtstq_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = and <16 x i8> %b, %a +// CHECK-NEXT: %1 = icmp ne <16 x i8> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <16 x i1> %1 to <16 x i8> +// CHECK-NEXT: ret <16 x i8> %vtst.i +// CHECK-LABEL: test_vtstq_p16 +// CHECK: entry: +// CHECK-NEXT: %0 = and <8 x i16> %b, %a +// CHECK-NEXT: %1 = icmp ne <8 x i16> %0, zeroinitializer +// CHECK-NEXT: %vtst.i = sext <8 x i1> %1 to <8 x i16> +// CHECK-NEXT: ret <8 x i16> %vtst.i +// CHECK-LABEL: test_vuzp_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vuzp.i, <8 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vuzp1.i, <8 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzp_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vuzp.i, <4 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vuzp1.i, <4 x i16>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzp_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int32x2x2_t, 
%struct.int32x2x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK-NEXT: store <2 x i32> %vuzp.i, <2 x i32>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK-NEXT: store <2 x i32> %vuzp1.i, <2 x i32>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzp_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vuzp.i, <8 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vuzp1.i, <8 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzp_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vuzp.i, <4 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vuzp1.i, <4 x i16>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzp_u32 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK-NEXT: store <2 x i32> %vuzp.i, <2 x i32>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK-NEXT: store <2 x i32> %vuzp1.i, <2 x i32>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzp_f32 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK-NEXT: store <2 x float> %vuzp.i, <2 x float>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK-NEXT: store <2 x float> %vuzp1.i, <2 x float>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzp_p8 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vuzp.i, <8 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK-NEXT: store <8 x i8> %vuzp1.i, <8 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzp_p16 +// CHECK: entry: +// CHECK-NEXT: 
%0 = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vuzp.i, <4 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK-NEXT: store <4 x i16> %vuzp1.i, <4 x i16>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzpq_s8 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK-NEXT: store <16 x i8> %vuzp.i, <16 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK-NEXT: store <16 x i8> %vuzp1.i, <16 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzpq_s16 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK-NEXT: store <8 x i16> %vuzp.i, <8 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK-NEXT: store <8 x i16> %vuzp1.i, <8 x i16>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzpq_s32 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK-NEXT: store <4 x i32> %vuzp.i, <4 x i32>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK-NEXT: store <4 x i32> %vuzp1.i, <4 x i32>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzpq_u8 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK-NEXT: store <16 x i8> %vuzp.i, <16 x i8>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK-NEXT: store <16 x i8> %vuzp1.i, <16 x i8>* %1, align 8 +// CHECK-NEXT: ret void +// CHECK-LABEL: test_vuzpq_u16 +// CHECK: entry: +// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* %agg.result, i32 0, i32 0, i32 0 +// CHECK-NEXT: %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK-NEXT: store <8 x i16> %vuzp.i, <8 x i16>* %0, align 8 +// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* %agg.result, i32 0, i32 0, i32 1 +// CHECK-NEXT: %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK-NEXT: store <8 x i16> %vuzp1.i, <8 x i16>* %1, align 8 +// CHECK-NEXT: ret void +// 
CHECK-LABEL: test_vuzpq_u32
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vuzp.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x i32> %vuzp.i, <4 x i32>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vuzp1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x i32> %vuzp1.i, <4 x i32>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vuzpq_f32
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vuzp.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x float> %vuzp.i, <4 x float>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vuzp1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x float> %vuzp1.i, <4 x float>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vuzpq_p8
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: store <16 x i8> %vuzp.i, <16 x i8>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: store <16 x i8> %vuzp1.i, <16 x i8>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vuzpq_p16
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i16> %vuzp.i, <8 x i16>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i16> %vuzp1.i, <8 x i16>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzip_s8
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i8> %vzip.i, <8 x i8>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> %vzip1.i, <8 x i8>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzip_s16
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x i16> %vzip.i, <4 x i16>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> %vzip1.i, <4 x i16>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzip_s32
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> %vzip.i, <2 x i32>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> %vzip1.i, <2 x i32>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzip_u8
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i8> %vzip.i, <8 x i8>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> %vzip1.i, <8 x i8>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzip_u16
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x i16> %vzip.i, <4 x i16>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> %vzip1.i, <4 x i16>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzip_u32
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> %vzip.i, <2 x i32>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> %vzip1.i, <2 x i32>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzip_f32
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x float> %vzip.i, <2 x float>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x float> %vzip1.i, <2 x float>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzip_p8
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i8> %vzip.i, <8 x i8>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> %vzip1.i, <8 x i8>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzip_p16
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x i16> %vzip.i, <4 x i16>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> %vzip1.i, <4 x i16>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzipq_s8
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: store <16 x i8> %vzip.i, <16 x i8>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: store <16 x i8> %vzip1.i, <16 x i8>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzipq_s16
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i16> %vzip.i, <8 x i16>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i16> %vzip1.i, <8 x i16>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzipq_s32
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x i32> %vzip.i, <4 x i32>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i32> %vzip1.i, <4 x i32>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzipq_u8
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: store <16 x i8> %vzip.i, <16 x i8>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: store <16 x i8> %vzip1.i, <16 x i8>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzipq_u16
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i16> %vzip.i, <8 x i16>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i16> %vzip1.i, <8 x i16>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzipq_u32
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x i32> %vzip.i, <4 x i32>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i32> %vzip1.i, <4 x i32>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzipq_f32
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x float> %vzip.i, <4 x float>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x float> %vzip1.i, <4 x float>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzipq_p8
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: store <16 x i8> %vzip.i, <16 x i8>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: store <16 x i8> %vzip1.i, <16 x i8>* %1, align 8
+// CHECK-NEXT: ret void
+// CHECK-LABEL: test_vzipq_p16
+// CHECK: entry:
+// CHECK-NEXT: %0 = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* %agg.result, i32 0, i32 0, i32 0
+// CHECK-NEXT: %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i16> %vzip.i, <8 x i16>* %0, align 8
+// CHECK-NEXT: %1 = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* %agg.result, i32 0, i32 0, i32 1
+// CHECK-NEXT: %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i16> %vzip1.i, <8 x i16>* %1, align 8
+// CHECK-NEXT: ret void
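
Note: every vzip/vuzp check above follows one pattern: the intrinsic lowers to two shufflevector instructions (even/odd lane masks for vuzp, interleaving masks for vzip), and the two results are stored into the two fields of the sret aggregate %agg.result. As a minimal illustrative sketch (not part of the patch), the C source shape that produces these checks looks like the following; the actual definitions live in test/CodeGen/arm_neon_intrinsics.c:

    #include <arm_neon.h>

    /* vzipq_u32 interleaves the lanes of a and b. At -O1 Clang emits the
       two shufflevectors checked above (masks <0,4,1,5> and <2,6,3,7>)
       and stores them into the two halves of %agg.result. */
    uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
      return vzipq_u32(a, b);
    }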