Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -264,12 +264,6 @@
       Intrinsic<[], [llvm_ptr_ty], []>;
 }
 
-// Misc.
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse_movmsk_ps : GCCBuiltin<"__builtin_ia32_movmskps">,
-      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-}
-
 //===----------------------------------------------------------------------===//
 // SSE2
 
@@ -490,10 +484,6 @@
   def int_x86_sse2_packuswb_128 : GCCBuiltin<"__builtin_ia32_packuswb128">,
       Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_sse2_movmsk_pd : GCCBuiltin<"__builtin_ia32_movmskpd">,
-      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse2_pmovmskb_128 : GCCBuiltin<"__builtin_ia32_pmovmskb128">,
-      Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
   def int_x86_sse2_maskmov_dqu : GCCBuiltin<"__builtin_ia32_maskmovdqu">,
       Intrinsic<[], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_ptr_ty], []>;
@@ -1466,14 +1456,6 @@
                          [IntrNoMem]>;
 }
 
-// Vector extract sign mask
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx_movmsk_pd_256 : GCCBuiltin<"__builtin_ia32_movmskpd256">,
-      Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
-  def int_x86_avx_movmsk_ps_256 : GCCBuiltin<"__builtin_ia32_movmskps256">,
-      Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
-}
-
 // Vector zero
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx_vzeroall : GCCBuiltin<"__builtin_ia32_vzeroall">,
@@ -2075,8 +2057,6 @@
 
 // Misc.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">,
-      Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
   def int_x86_avx2_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb256">,
       Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>;
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -203,6 +203,12 @@
       Name.startswith("sse41.pmovzx") || // Added in 3.9
       Name.startswith("avx2.pmovsx") || // Added in 3.9
       Name.startswith("avx2.pmovzx") || // Added in 3.9
+      Name.startswith("sse.movmsk.ps") || // Added in 6.0
+      Name.startswith("sse2.movmsk.pd") || // Added in 6.0
+      Name.startswith("sse2.pmovmskb.128") || // Added in 6.0
+      Name.startswith("avx.movmsk.pd.256") || // Added in 6.0
+      Name.startswith("avx.movmsk.ps.256") || // Added in 6.0
+      Name.startswith("avx2.pmovmskb") || // Added in 6.0
       Name.startswith("avx512.mask.pmovsx") || // Added in 4.0
       Name.startswith("avx512.mask.pmovzx") || // Added in 4.0
       Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0
@@ -910,6 +916,29 @@
   return Builder.CreateSExt(Mask, ReturnOp, "vpmovm2");
 }
 
+/// Lower a movmsk-style intrinsic on an integer vector: test each element's
+/// sign bit with an (icmp slt X, 0), bitcast the <N x i1> result to iN, and
+/// zero-extend that to i32 when there are fewer than 32 elements.
+static Value *EmitX86Mask(IRBuilder<> &Builder, ArrayRef<Value *> Ops) {
+  Type *Typ1 = Ops[0]->getType();
+  Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLT, Ops[0],
+                                  Constant::getNullValue(Typ1));
+  Value *BitCast = Builder.CreateBitCast(
+      Cmp, Type::getIntNTy(Builder.getContext(), Typ1->getVectorNumElements()));
+  return (Typ1->getVectorNumElements() < 32)
+             ? Builder.CreateZExt(BitCast,
+                                  Type::getInt32Ty(Builder.getContext()))
+             : BitCast;
+}
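+
+// For illustration only (the value names are invented): a call such as
+//   %r = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %v)
+// should be upgraded to roughly
+//   %cmp = icmp slt <16 x i8> %v, zeroinitializer
+//   %msk = bitcast <16 x i1> %cmp to i16
+//   %r   = zext i16 %msk to i32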
+
+/// Lower a movmsk-style intrinsic on a floating-point vector: bitcast the
+/// 32/64-bit elements to integers of the same width, then extract the sign
+/// mask as in the integer case.
+static Value *EmitX86MaskFloat(IRBuilder<> &Builder, ArrayRef<Value *> Ops) {
+  unsigned DstTypEle = Ops[0]->getType()->getVectorNumElements();
+  Type *New = Ops[0]->getType()->getScalarSizeInBits() == 32
+                  ? Type::getInt32Ty(Builder.getContext())
+                  : Type::getInt64Ty(Builder.getContext());
+  Type *DstTyp = VectorType::get(New, DstTypEle);
+  Value *BitCastFloatToInt = Builder.CreateBitCast(Ops[0], DstTyp);
+  return EmitX86Mask(Builder, BitCastFloatToInt);
+}
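+
+// For illustration only (the value names are invented): a call such as
+//   %r = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %v)
+// should become roughly
+//   %iv  = bitcast <4 x double> %v to <4 x i64>
+//   %cmp = icmp slt <4 x i64> %iv, zeroinitializer
+//   %msk = bitcast <4 x i1> %cmp to i4
+//   %r   = zext i4 %msk to i32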
+
 /// Upgrade a call to an old intrinsic. All argument and return casting must be
 /// provided to seamlessly integrate with existing context.
 void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
@@ -1278,6 +1307,14 @@
       if (CI->getNumArgOperands() == 3)
         Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                             CI->getArgOperand(1));
+    } else if (IsX86 && (Name.startswith("avx2.pmovmskb") ||
+                         Name.startswith("sse2.pmovmskb.128"))) {
+      Rep = EmitX86Mask(Builder, CI->getOperand(0));
+    } else if (IsX86 && (Name.startswith("sse.movmsk.ps") ||
+                         Name.startswith("sse2.movmsk.pd") ||
+                         Name.startswith("avx.movmsk.pd.256") ||
+                         Name.startswith("avx.movmsk.ps.256"))) {
+      Rep = EmitX86MaskFloat(Builder, CI->getOperand(0));
     } else if (IsX86 && (Name.startswith("avx.vbroadcastf128") ||
                          Name == "avx2.vbroadcasti128")) {
       // Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -30124,6 +30124,18 @@
                        DAG.getBitcast(MVT::v2i64, Res));
   }
 
+  // Combine (bitcast (setcc Vec, AllZerosVec, setlt)) to a scalar i32/i64
+  // type into (X86ISD::MOVMSK Vec).
+  if (N0->getOpcode() == ISD::SETCC && !VT.isFloatingPoint() &&
+      !VT.isVector()) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+    SDValue LHS = N0.getOperand(0);
+    SDValue RHS = N0.getOperand(1);
+    if (!LHS.isUndef() && isNullConstant(RHS.getOperand(0)) &&
+        CC == ISD::CondCode::SETLT)
+      return DAG.getNode(X86ISD::MOVMSK, SDLoc(N0), VT, LHS);
+  }
+
   // Convert a bitcasted integer logic operation that has one bitcasted
   // floating-point operand into a floating-point logic operation. This may
   // create a load of a constant, but that is cheaper than materializing the
@@ -30159,7 +30171,7 @@
     SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
     return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
   }
-  
+
   return SDValue();
 }
@@ -35693,6 +35705,26 @@
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
+  // Combine (i32 (zext (bitcast (setcc (bitcast Vec1), Vec2)))) into
+  // (X86ISD::MOVMSK Vec1); the i64 form becomes the MOVMSK result extended
+  // to i64.
+  if ((VT == MVT::i32 || VT == MVT::i64) && N0.getOpcode() == ISD::BITCAST &&
+      N0.getOperand(0).getOpcode() == ISD::SETCC) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOperand(0).getOpcode() == ISD::BITCAST &&
+        N00.getOperand(0).getOperand(0).getValueType().isFloatingPoint()) {
+      SDValue MaskI32 =
+          DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
+                      N0.getOperand(0).getOperand(0).getOperand(0));
+      return (VT == MVT::i32)
+                 ? MaskI32
+                 : DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, MaskI32);
+    }
+    if (VT == MVT::i32 &&
+        N00.getOperand(0).getValueType().getScalarSizeInBits() == 8)
+      return DAG.getNode(X86ISD::MOVMSK, dl, VT, N00.getOperand(0));
+  }
+
   if (N0.getOpcode() == ISD::AND &&
       N0.hasOneUse() &&
       N0.getOperand(0).hasOneUse()) {
@@ -35740,7 +35772,7 @@
 
   if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
     return R;
-  
+
   return SDValue();
 }
@@ -36718,6 +36750,26 @@
   return SDValue();
 }
 
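+// Illustrative only: after the autoupgrade above, a movmsk call is expected
+// to reach the DAG as
+//   (X86ISD::MOVMSK (sign_extend (setcc Vec, AllZerosVec, setlt)))
+// once the i1 condition vector has been legalized, and the combine below
+// folds it back to (X86ISD::MOVMSK Vec).
+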
+// Combine (X86ISD::MOVMSK (sign_extend (setcc Vec1, AllZerosVec, setlt)))
+// into (X86ISD::MOVMSK Vec1).
+static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
+                             const X86Subtarget &Subtarget) {
+  MVT VT = N->getSimpleValueType(0);
+  SDLoc DL(N);
+
+  if (N->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue N0 = N->getOperand(0);
+    if (N0.getOperand(0).getOpcode() == ISD::SETCC) {
+      SDValue N00 = N0.getOperand(0);
+      SDValue LHS = N00.getOperand(0);
+      SDValue RHS = N00.getOperand(1);
+      ISD::CondCode CC = cast<CondCodeSDNode>(N00.getOperand(2))->get();
+      if (isNullConstant(RHS.getOperand(0)) && CC == ISD::SETLT)
+        return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, LHS);
+    }
+  }
+  return SDValue();
+}
+
 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
@@ -37023,6 +37075,7 @@
   case X86ISD::TESTM:       return combineTestM(N, DAG, Subtarget);
   case X86ISD::PCMPEQ:
   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
+  case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, Subtarget);
   }
 
   return SDValue();
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -385,8 +385,6 @@
   X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
   X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
   X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
-  X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
-  X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0),
   X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
@@ -411,7 +409,6 @@
   X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
   X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
   X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
-  X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
   X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
   X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
@@ -1597,7 +1594,6 @@
   X86_INTRINSIC_DATA(sse_max_ss, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
   X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
   X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0),
-  X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
   X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
   X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
@@ -1623,7 +1619,6 @@
   X86_INTRINSIC_DATA(sse2_max_sd, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
   X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
   X86_INTRINSIC_DATA(sse2_min_sd, INTR_TYPE_2OP, X86ISD::FMINS, 0),
-  X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
@@ -1632,7 +1627,6 @@
   X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
-  X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
   X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
   X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2319,12 +2319,6 @@
   }
 
   case Intrinsic::x86_mmx_pmovmskb:
-  case Intrinsic::x86_sse_movmsk_ps:
-  case Intrinsic::x86_sse2_movmsk_pd:
-  case Intrinsic::x86_sse2_pmovmskb_128:
-  case Intrinsic::x86_avx_movmsk_pd_256:
-  case Intrinsic::x86_avx_movmsk_ps_256:
-  case Intrinsic::x86_avx2_pmovmskb:
     if (Value *V = simplifyX86movmsk(*II))
       return replaceInstUsesWith(*II, V);
     break;
Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -638,12 +638,7 @@
     break;
   }
   case Intrinsic::x86_mmx_pmovmskb:
-  case Intrinsic::x86_sse_movmsk_ps:
-  case Intrinsic::x86_sse2_movmsk_pd:
-  case Intrinsic::x86_sse2_pmovmskb_128:
-  case Intrinsic::x86_avx_movmsk_ps_256:
-  case Intrinsic::x86_avx_movmsk_pd_256:
-  case Intrinsic::x86_avx2_pmovmskb: {
+  {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
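+   // E.g. for the remaining x86_mmx_pmovmskb case, bits 0-7 of the i32
+   // result hold the eight byte sign bits and bits 8-31 are zero.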
unsigned ArgWidth; Index: test/CodeGen/X86/avx-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -5,73 +5,48 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_add_pd: -; X32: # BB#0: -; X32-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_add_pd: -; X64: # BB#0: -; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_add_pd: +; ALL: # BB#0: +; ALL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = fadd <4 x double> %a0, %a1 ret <4 x double> %res } define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_add_ps: -; X32: # BB#0: -; X32-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_add_ps: -; X64: # BB#0: -; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_add_ps: +; ALL: # BB#0: +; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = fadd <8 x float> %a0, %a1 ret <8 x float> %res } define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_addsub_pd: -; X32: # BB#0: -; X32-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_addsub_pd: -; X64: # BB#0: -; X64-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_addsub_pd: +; ALL: # BB#0: +; ALL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_addsub_ps: -; X32: # BB#0: -; X32-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_addsub_ps: -; X64: # BB#0: -; X64-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_addsub_ps: +; ALL: # BB#0: +; ALL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) ret <8 x float> %res } declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_and_pd: -; X32: # BB#0: -; X32-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_and_pd: -; X64: # BB#0: -; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_and_pd: +; ALL: # BB#0: +; ALL-NEXT: vandps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %res = and <4 x i64> %1, %2 @@ -80,15 +55,10 @@ } define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_and_ps: -; X32: # BB#0: -; X32-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_and_ps: -; X64: # BB#0: -; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_and_ps: +; ALL: # BB#0: +; ALL-NEXT: vandps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %1 
= bitcast <8 x float> %a0 to <8 x i32> %2 = bitcast <8 x float> %a1 to <8 x i32> %res = and <8 x i32> %1, %2 @@ -97,21 +67,13 @@ } define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_andnot_pd: -; X32: # BB#0: -; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X32-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; X32-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; X32-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_andnot_pd: -; X64: # BB#0: -; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X64-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; X64-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_andnot_pd: +; ALL: # BB#0: +; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ALL-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 +; ALL-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; ALL-NEXT: vandps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = xor <4 x i64> %1, @@ -121,15 +83,10 @@ } define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_andnot_ps: -; X32: # BB#0: -; X32-NEXT: vandnps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_andnot_ps: -; X64: # BB#0: -; X64-NEXT: vandnps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_andnot_ps: +; ALL: # BB#0: +; ALL-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %1 = bitcast <8 x float> %a0 to <8 x i32> %2 = bitcast <8 x float> %a1 to <8 x i32> %3 = xor <8 x i32> %1, @@ -139,58 +96,38 @@ } define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_blend_pd: -; X32: # BB#0: -; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_blend_pd: -; X64: # BB#0: -; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_blend_pd: +; ALL: # BB#0: +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> ret <4 x double> %res } define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_blend_ps: -; X32: # BB#0: -; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_blend_ps: -; X64: # BB#0: -; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_blend_ps: +; ALL: # BB#0: +; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> ret <8 x float> %res } define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind { -; X32-LABEL: test_mm256_blendv_pd: -; X32: # BB#0: -; X32-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_blendv_pd: -; X64: # BB#0: -; X64-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_blendv_pd: +; ALL: # BB#0: +; ALL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind 
readnone define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind { -; X32-LABEL: test_mm256_blendv_ps: -; X32: # BB#0: -; X32-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_blendv_ps: -; X64: # BB#0: -; X64-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_blendv_ps: +; ALL: # BB#0: +; ALL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ret <8 x float> %res } @@ -201,12 +138,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_broadcast_pd: ; X64: # BB#0: ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %ld = load <2 x double>, <2 x double>* %a0 %res = shufflevector <2 x double> %ld, <2 x double> %ld, <4 x i32> ret <4 x double> %res @@ -217,12 +154,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_broadcast_ps: ; X64: # BB#0: ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %ld = load <4 x float>, <4 x float>* %a0 %res = shufflevector <4 x float> %ld, <4 x float> %ld, <8 x i32> ret <8 x float> %res @@ -233,12 +170,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vbroadcastsd (%eax), %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_broadcast_sd: ; X64: # BB#0: ; X64-NEXT: vbroadcastsd (%rdi), %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %ld = load double, double* %a0 %ins0 = insertelement <4 x double> undef, double %ld, i32 0 %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1 @@ -252,12 +189,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vbroadcastss (%eax), %xmm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm_broadcast_ss: ; X64: # BB#0: ; X64-NEXT: vbroadcastss (%rdi), %xmm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %ld = load float, float* %a0 %ins0 = insertelement <4 x float> undef, float %ld, i32 0 %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1 @@ -271,12 +208,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vbroadcastss (%eax), %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_broadcast_ss: ; X64: # BB#0: ; X64-NEXT: vbroadcastss (%rdi), %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %ld = load float, float* %a0 %ins0 = insertelement <8 x float> undef, float %ld, i32 0 %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1 @@ -290,312 +227,205 @@ } define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_castpd_ps: -; X32: # BB#0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castpd_ps: -; X64: # BB#0: -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castpd_ps: +; ALL: # BB#0: +; ALL-NEXT: ret{{[l|q]}} %res = bitcast <4 x double> %a0 to <8 x float> ret <8 x float> %res } define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_castpd_si256: -; X32: # BB#0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castpd_si256: -; X64: # BB#0: -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castpd_si256: +; ALL: # BB#0: +; ALL-NEXT: ret{{[l|q]}} 
%res = bitcast <4 x double> %a0 to <4 x i64> ret <4 x i64> %res } define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind { -; X32-LABEL: test_mm256_castpd128_pd256: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castpd128_pd256: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castpd128_pd256: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> ret <4 x double> %res } define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_castpd256_pd128: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castpd256_pd128: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castpd256_pd128: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> ret <2 x double> %res } define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_castps_pd: -; X32: # BB#0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castps_pd: -; X64: # BB#0: -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castps_pd: +; ALL: # BB#0: +; ALL-NEXT: ret{{[l|q]}} %res = bitcast <8 x float> %a0 to <4 x double> ret <4 x double> %res } define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_castps_si256: -; X32: # BB#0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castps_si256: -; X64: # BB#0: -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castps_si256: +; ALL: # BB#0: +; ALL-NEXT: ret{{[l|q]}} %res = bitcast <8 x float> %a0 to <4 x i64> ret <4 x i64> %res } define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind { -; X32-LABEL: test_mm256_castps128_ps256: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castps128_ps256: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castps128_ps256: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> ret <8 x float> %res } define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_castps256_ps128: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castps256_ps128: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castps256_ps128: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> ret <4 x float> %res } define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_castsi128_si256: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castsi128_si256: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castsi128_si256: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> ret <4 x i64> %res } define <4 x double> 
@test_mm256_castsi256_pd(<4 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_castsi256_pd: -; X32: # BB#0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castsi256_pd: -; X64: # BB#0: -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castsi256_pd: +; ALL: # BB#0: +; ALL-NEXT: ret{{[l|q]}} %res = bitcast <4 x i64> %a0 to <4 x double> ret <4 x double> %res } define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_castsi256_ps: -; X32: # BB#0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castsi256_ps: -; X64: # BB#0: -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castsi256_ps: +; ALL: # BB#0: +; ALL-NEXT: ret{{[l|q]}} %res = bitcast <4 x i64> %a0 to <8 x float> ret <8 x float> %res } define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_castsi256_si128: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_castsi256_si128: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_castsi256_si128: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> ret <2 x i64> %res } define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_ceil_pd: -; X32: # BB#0: -; X32-NEXT: vroundpd $2, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_ceil_pd: -; X64: # BB#0: -; X64-NEXT: vroundpd $2, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_ceil_pd: +; ALL: # BB#0: +; ALL-NEXT: vroundpd $2, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2) ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_ceil_ps: -; X32: # BB#0: -; X32-NEXT: vroundps $2, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_ceil_ps: -; X64: # BB#0: -; X64-NEXT: vroundps $2, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_ceil_ps: +; ALL: # BB#0: +; ALL-NEXT: vroundps $2, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2) ret <8 x float> %res } declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind { -; X32-LABEL: test_mm_cmp_pd: -; X32: # BB#0: -; X32-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmp_pd: -; X64: # BB#0: -; X64-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmp_pd: +; ALL: # BB#0: +; ALL-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13) ret <2 x double> %res } declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_cmp_pd: -; X32: # BB#0: -; X32-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cmp_pd: -; X64: # BB#0: -; X64-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_cmp_pd: +; ALL: # BB#0: +; ALL-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> 
@llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13) ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_cmp_ps: -; X32: # BB#0: -; X32-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmp_ps: -; X64: # BB#0: -; X64-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmp_ps: +; ALL: # BB#0: +; ALL-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_cmp_ps: -; X32: # BB#0: -; X32-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cmp_ps: -; X64: # BB#0: -; X64-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_cmp_ps: +; ALL: # BB#0: +; ALL-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13) ret <8 x float> %res } declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind { -; X32-LABEL: test_mm_cmp_sd: -; X32: # BB#0: -; X32-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmp_sd: -; X64: # BB#0: -; X64-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmp_sd: +; ALL: # BB#0: +; ALL-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13) ret <2 x double> %res } declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_cmp_ss: -; X32: # BB#0: -; X32-NEXT: vcmpgess %xmm1, %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmp_ss: -; X64: # BB#0: -; X64-NEXT: vcmpgess %xmm1, %xmm0, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmp_ss: +; ALL: # BB#0: +; ALL-NEXT: vcmpgess %xmm1, %xmm0, %xmm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_cvtepi32_pd: -; X32: # BB#0: -; X32-NEXT: vcvtdq2pd %xmm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cvtepi32_pd: -; X64: # BB#0: -; X64-NEXT: vcvtdq2pd %xmm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_cvtepi32_pd: +; ALL: # BB#0: +; ALL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %res = sitofp <4 x i32> %arg0 to <4 x double> ret <4 x double> %res } define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_cvtepi32_ps: -; X32: # BB#0: -; X32-NEXT: vcvtdq2ps %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cvtepi32_ps: -; X64: # BB#0: -; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_cvtepi32_ps: +; ALL: # BB#0: +; ALL-NEXT: vcvtdq2ps %ymm0, %ymm0 +; 
ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %res = call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %arg0) ret <8 x float> %res @@ -603,17 +433,11 @@ declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_cvtpd_epi32: -; X32: # BB#0: -; X32-NEXT: vcvtpd2dq %ymm0, %xmm0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cvtpd_epi32: -; X64: # BB#0: -; X64-NEXT: vcvtpd2dq %ymm0, %xmm0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_cvtpd_epi32: +; ALL: # BB#0: +; ALL-NEXT: vcvtpd2dq %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) %res = bitcast <4 x i32> %cvt to <2 x i64> ret <2 x i64> %res @@ -621,32 +445,21 @@ declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_cvtpd_ps: -; X32: # BB#0: -; X32-NEXT: vcvtpd2ps %ymm0, %xmm0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cvtpd_ps: -; X64: # BB#0: -; X64-NEXT: vcvtpd2ps %ymm0, %xmm0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_cvtpd_ps: +; ALL: # BB#0: +; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) ret <4 x float> %res } declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_cvtps_epi32: -; X32: # BB#0: -; X32-NEXT: vcvtps2dq %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cvtps_epi32: -; X64: # BB#0: -; X64-NEXT: vcvtps2dq %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_cvtps_epi32: +; ALL: # BB#0: +; ALL-NEXT: vcvtps2dq %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0) %res = bitcast <8 x i32> %cvt to <4 x i64> ret <4 x i64> %res @@ -654,31 +467,20 @@ declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind { -; X32-LABEL: test_mm256_cvtps_pd: -; X32: # BB#0: -; X32-NEXT: vcvtps2pd %xmm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cvtps_pd: -; X64: # BB#0: -; X64-NEXT: vcvtps2pd %xmm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_cvtps_pd: +; ALL: # BB#0: +; ALL-NEXT: vcvtps2pd %xmm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = fpext <4 x float> %a0 to <4 x double> ret <4 x double> %res } define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_cvttpd_epi32: -; X32: # BB#0: -; X32-NEXT: vcvttpd2dq %ymm0, %xmm0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cvttpd_epi32: -; X64: # BB#0: -; X64-NEXT: vcvttpd2dq %ymm0, %xmm0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_cvttpd_epi32: +; ALL: # BB#0: +; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) %res = bitcast <4 x i32> %cvt to <2 x i64> ret <2 x i64> %res @@ -686,15 +488,10 @@ declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_cvttps_epi32: -; X32: # BB#0: -; X32-NEXT: 
vcvttps2dq %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cvttps_epi32: -; X64: # BB#0: -; X64-NEXT: vcvttps2dq %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_cvttps_epi32: +; ALL: # BB#0: +; ALL-NEXT: vcvttps2dq %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) %res = bitcast <8 x i32> %cvt to <4 x i64> ret <4 x i64> %res @@ -702,64 +499,41 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_div_pd: -; X32: # BB#0: -; X32-NEXT: vdivpd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_div_pd: -; X64: # BB#0: -; X64-NEXT: vdivpd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_div_pd: +; ALL: # BB#0: +; ALL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = fdiv <4 x double> %a0, %a1 ret <4 x double> %res } define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_div_ps: -; X32: # BB#0: -; X32-NEXT: vdivps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_div_ps: -; X64: # BB#0: -; X64-NEXT: vdivps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_div_ps: +; ALL: # BB#0: +; ALL-NEXT: vdivps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = fdiv <8 x float> %a0, %a1 ret <8 x float> %res } define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_dp_ps: -; X32: # BB#0: -; X32-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_dp_ps: -; X64: # BB#0: -; X64-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_dp_ps: +; ALL: # BB#0: +; ALL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ret <8 x float> %res } declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_extract_epi8: -; X32: # BB#0: -; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X32-NEXT: vpextrb $15, %xmm0, %eax -; X32-NEXT: movzbl %al, %eax -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_extract_epi8: -; X64: # BB#0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-NEXT: vpextrb $15, %xmm0, %eax -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_extract_epi8: +; ALL: # BB#0: +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vpextrb $15, %xmm0, %eax +; ALL-NEXT: movzbl %al, %eax +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %ext = extractelement <32 x i8> %arg0, i32 31 %res = zext i8 %ext to i32 @@ -767,21 +541,13 @@ } define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_extract_epi16: -; X32: # BB#0: -; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X32-NEXT: vpextrw $3, %xmm0, %eax -; X32-NEXT: movzwl %ax, %eax -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_extract_epi16: -; X64: # BB#0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-NEXT: vpextrw $3, %xmm0, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_extract_epi16: +; ALL: # BB#0: +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vpextrw $3, %xmm0, %eax +; ALL-NEXT: movzwl %ax, 
%eax +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %ext = extractelement <16 x i16> %arg0, i32 11 %res = zext i16 %ext to i32 @@ -789,19 +555,12 @@ } define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_extract_epi32: -; X32: # BB#0: -; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X32-NEXT: vextractps $1, %xmm0, %eax -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_extract_epi32: -; X64: # BB#0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-NEXT: vextractps $1, %xmm0, %eax -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_extract_epi32: +; ALL: # BB#0: +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vextractps $1, %xmm0, %eax +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %res = extractelement <8 x i32> %arg0, i32 5 ret i32 %res @@ -814,149 +573,101 @@ ; X32-NEXT: vextractps $2, %xmm0, %eax ; X32-NEXT: vextractps $3, %xmm0, %edx ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_extract_epi64: ; X64: # BB#0: ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X64-NEXT: vpextrq $1, %xmm0, %rax ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res = extractelement <4 x i64> %a0, i32 3 ret i64 %res } define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_extractf128_pd: -; X32: # BB#0: -; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_extractf128_pd: -; X64: # BB#0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_extractf128_pd: +; ALL: # BB#0: +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> ret <2 x double> %res } define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_extractf128_ps: -; X32: # BB#0: -; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_extractf128_ps: -; X64: # BB#0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_extractf128_ps: +; ALL: # BB#0: +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> ret <4 x float> %res } define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_extractf128_si256: -; X32: # BB#0: -; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_extractf128_si256: -; X64: # BB#0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_extractf128_si256: +; ALL: # BB#0: +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> ret <2 x i64> %res } define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_floor_pd: -; X32: # BB#0: -; X32-NEXT: vroundpd $1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_floor_pd: -; X64: # BB#0: -; X64-NEXT: vroundpd $1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_floor_pd: +; ALL: # BB#0: +; ALL-NEXT: vroundpd $1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} 
%res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1) ret <4 x double> %res } define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_floor_ps: -; X32: # BB#0: -; X32-NEXT: vroundps $1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_floor_ps: -; X64: # BB#0: -; X64-NEXT: vroundps $1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_floor_ps: +; ALL: # BB#0: +; ALL-NEXT: vroundps $1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1) ret <8 x float> %res } define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_hadd_pd: -; X32: # BB#0: -; X32-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_hadd_pd: -; X64: # BB#0: -; X64-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_hadd_pd: +; ALL: # BB#0: +; ALL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_hadd_ps: -; X32: # BB#0: -; X32-NEXT: vhaddps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_hadd_ps: -; X64: # BB#0: -; X64-NEXT: vhaddps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_hadd_ps: +; ALL: # BB#0: +; ALL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) ret <8 x float> %res } declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_hsub_pd: -; X32: # BB#0: -; X32-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_hsub_pd: -; X64: # BB#0: -; X64-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_hsub_pd: +; ALL: # BB#0: +; ALL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_hsub_ps: -; X32: # BB#0: -; X32-NEXT: vhsubps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_hsub_ps: -; X64: # BB#0: -; X64-NEXT: vhsubps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_hsub_ps: +; ALL: # BB#0: +; ALL-NEXT: vhsubps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) ret <8 x float> %res } @@ -968,14 +679,14 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_insert_epi8: ; X64: # BB#0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4 
%bc = bitcast <32 x i8> %res to <4 x i64> @@ -989,14 +700,14 @@ ; X32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_insert_epi16: ; X64: # BB#0: ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vpinsrw $6, %edi, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14 %bc = bitcast <16 x i16> %res to <4 x i64> @@ -1008,13 +719,13 @@ ; X32: # BB#0: ; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_insert_epi32: ; X64: # BB#0: ; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3 %bc = bitcast <8 x i32> %res to <4 x i64> @@ -1028,62 +739,45 @@ ; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_insert_epi64: ; X64: # BB#0: ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res = insertelement <4 x i64> %a0, i64 %a1, i32 3 ret <4 x i64> %res } define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind { -; X32-LABEL: test_mm256_insertf128_pd: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 -; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_insertf128_pd: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 -; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_insertf128_pd: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; ALL-NEXT: ret{{[l|q]}} %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> ret <4 x double> %res } define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm256_insertf128_ps: -; X32: # BB#0: -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_insertf128_ps: -; X64: # BB#0: -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_insertf128_ps: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> ret <8 x float> %res } define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind { -; X32-LABEL: test_mm256_insertf128_si256: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 -; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_insertf128_si256: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 -; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; X64-NEXT: retq +; ALL-LABEL: 
test_mm256_insertf128_si256: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; ALL-NEXT: ret{{[l|q]}} %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> ret <4 x i64> %res @@ -1094,12 +788,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vlddqu (%eax), %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_lddqu_si256: ; X64: # BB#0: ; X64-NEXT: vlddqu (%rdi), %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64>* %a0 to i8* %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %arg0) %bc = bitcast <32 x i8> %res to <4 x i64> @@ -1112,12 +806,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovaps (%eax), %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_load_pd: ; X64: # BB#0: ; X64-NEXT: vmovaps (%rdi), %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to <4 x double>* %res = load <4 x double>, <4 x double>* %arg0, align 32 ret <4 x double> %res @@ -1128,12 +822,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovaps (%eax), %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_load_ps: ; X64: # BB#0: ; X64-NEXT: vmovaps (%rdi), %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to <8 x float>* %res = load <8 x float>, <8 x float>* %arg0, align 32 ret <8 x float> %res @@ -1144,12 +838,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovaps (%eax), %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_load_si256: ; X64: # BB#0: ; X64-NEXT: vmovaps (%rdi), %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res = load <4 x i64>, <4 x i64>* %a0, align 32 ret <4 x i64> %res } @@ -1159,12 +853,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovups (%eax), %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_loadu_pd: ; X64: # BB#0: ; X64-NEXT: vmovups (%rdi), %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to <4 x double>* %res = load <4 x double>, <4 x double>* %arg0, align 1 ret <4 x double> %res @@ -1175,12 +869,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovups (%eax), %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_loadu_ps: ; X64: # BB#0: ; X64-NEXT: vmovups (%rdi), %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to <8 x float>* %res = load <8 x float>, <8 x float>* %arg0, align 1 ret <8 x float> %res @@ -1191,12 +885,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovups (%eax), %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_loadu_si256: ; X64: # BB#0: ; X64-NEXT: vmovups (%rdi), %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res = load <4 x i64>, <4 x i64>* %a0, align 1 ret <4 x i64> %res } @@ -1208,13 +902,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: vmovups (%eax), %xmm0 ; X32-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_loadu2_m128: ; X64: # BB#0: ; X64-NEXT: vmovups (%rsi), %xmm0 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to <4 x float>* %hi4 = load <4 x float>, <4 x float>* %arg0, align 1 %hi8 = shufflevector <4 
x float> %hi4, <4 x float> %hi4, <8 x i32> @@ -1232,13 +926,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: vmovups (%eax), %xmm0 ; X32-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_loadu2_m128d: ; X64: # BB#0: ; X64-NEXT: vmovups (%rsi), %xmm0 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to <2 x double>* %hi2 = load <2 x double>, <2 x double>* %arg0, align 1 %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> @@ -1256,13 +950,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: vmovups (%eax), %xmm0 ; X32-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_loadu2_m128i: ; X64: # BB#0: ; X64-NEXT: vmovups (%rsi), %xmm0 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast i64* %a0 to <2 x i64>* %hi2 = load <2 x i64>, <2 x i64>* %arg0, align 1 %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> @@ -1278,12 +972,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm_maskload_pd: ; X64: # BB#0: ; X64-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to i8* %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %arg0, <2 x i64> %a1) ret <2 x double> %res @@ -1295,12 +989,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_maskload_pd: ; X64: # BB#0: ; X64-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to i8* %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %arg0, <4 x i64> %a1) ret <4 x double> %res @@ -1312,12 +1006,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmaskmovps (%eax), %xmm0, %xmm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm_maskload_ps: ; X64: # BB#0: ; X64-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to i8* %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %arg0, <4 x i32> %arg1) @@ -1330,12 +1024,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmaskmovps (%eax), %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_maskload_ps: ; X64: # BB#0: ; X64-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to i8* %arg1 = bitcast <4 x i64> %a1 to <8 x i32> %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %arg0, <8 x i32> %arg1) @@ -1348,12 +1042,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax) -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm_maskstore_pd: ; X64: # BB#0: ; X64-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to i8* call void @llvm.x86.avx.maskstore.pd(i8* %arg0, <2 x i64> %a1, <2 x double> %a2) ret void @@ -1366,13 +1060,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: 
test_mm256_maskstore_pd: ; X64: # BB#0: ; X64-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to i8* call void @llvm.x86.avx.maskstore.pd.256(i8* %arg0, <4 x i64> %a1, <4 x double> %a2) ret void @@ -1384,12 +1078,12 @@ ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmaskmovps %xmm1, %xmm0, (%eax) -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm_maskstore_ps: ; X64: # BB#0: ; X64-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to i8* %arg1 = bitcast <2 x i64> %a1 to <4 x i32> call void @llvm.x86.avx.maskstore.ps(i8* %arg0, <4 x i32> %arg1, <4 x float> %a2) @@ -1403,13 +1097,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_maskstore_ps: ; X64: # BB#0: ; X64-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to i8* %arg1 = bitcast <4 x i64> %a1 to <8 x i32> call void @llvm.x86.avx.maskstore.ps.256(i8* %arg0, <8 x i32> %arg1, <8 x float> %a2) @@ -1418,179 +1112,123 @@ declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind readnone define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_max_pd: -; X32: # BB#0: -; X32-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_max_pd: -; X64: # BB#0: -; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_max_pd: +; ALL: # BB#0: +; ALL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_max_ps: -; X32: # BB#0: -; X32-NEXT: vmaxps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_max_ps: -; X64: # BB#0: -; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_max_ps: +; ALL: # BB#0: +; ALL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) ret <8 x float> %res } declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_min_pd: -; X32: # BB#0: -; X32-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_min_pd: -; X64: # BB#0: -; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_min_pd: +; ALL: # BB#0: +; ALL-NEXT: vminpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_min_ps: -; X32: # BB#0: -; X32-NEXT: vminps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_min_ps: -; X64: # BB#0: -; X64-NEXT: vminps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_min_ps: +; 
ALL: # BB#0:
+; ALL-NEXT: vminps %ymm1, %ymm0, %ymm0
+; ALL-NEXT: ret{{[l|q]}}
 %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
 ret <8 x float> %res
 }
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
 define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
-; X32-LABEL: test_mm256_movedup_pd:
-; X32: # BB#0:
-; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_movedup_pd:
-; X64: # BB#0:
-; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; X64-NEXT: retq
+; ALL-LABEL: test_mm256_movedup_pd:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: ret{{[l|q]}}
 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
 ret <4 x double> %res
 }
 define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
-; X32-LABEL: test_mm256_movehdup_ps:
-; X32: # BB#0:
-; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_movehdup_ps:
-; X64: # BB#0:
-; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; X64-NEXT: retq
+; ALL-LABEL: test_mm256_movehdup_ps:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; ALL-NEXT: ret{{[l|q]}}
 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
 ret <8 x float> %res
 }
 define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
-; X32-LABEL: test_mm256_moveldup_ps:
-; X32: # BB#0:
-; X32-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_moveldup_ps:
-; X64: # BB#0:
-; X64-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; X64-NEXT: retq
+; ALL-LABEL: test_mm256_moveldup_ps:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; ALL-NEXT: ret{{[l|q]}}
 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
 ret <8 x float> %res
 }
 define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
-; X32-LABEL: test_mm256_movemask_pd:
-; X32: # BB#0:
-; X32-NEXT: vmovmskpd %ymm0, %eax
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_movemask_pd:
-; X64: # BB#0:
-; X64-NEXT: vmovmskpd %ymm0, %eax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; ALL-LABEL: test_mm256_movemask_pd:
+; ALL: # BB#0:
+; ALL-NEXT: vmovmskpd %ymm0, %eax
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
 ret i32 %res
 }
 declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
 define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
-; X32-LABEL: test_mm256_movemask_ps:
-; X32: # BB#0:
-; X32-NEXT: vmovmskps %ymm0, %eax
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_movemask_ps:
-; X64: # BB#0:
-; X64-NEXT: vmovmskps %ymm0, %eax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; ALL-LABEL: test_mm256_movemask_ps:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
+; ALL-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: vmovmskps %ymm0, %eax
+; ALL-NEXT: movzbl %al, %eax
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
 ret i32 %res
 }
 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
 define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
-; X32-LABEL: test_mm256_mul_pd:
-; 
X32: # BB#0: -; X32-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_mul_pd: -; X64: # BB#0: -; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_mul_pd: +; ALL: # BB#0: +; ALL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = fmul <4 x double> %a0, %a1 ret <4 x double> %res } define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_mul_ps: -; X32: # BB#0: -; X32-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_mul_ps: -; X64: # BB#0: -; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_mul_ps: +; ALL: # BB#0: +; ALL-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = fmul <8 x float> %a0, %a1 ret <8 x float> %res } define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_or_pd: -; X32: # BB#0: -; X32-NEXT: vorps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_or_pd: -; X64: # BB#0: -; X64-NEXT: vorps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_or_pd: +; ALL: # BB#0: +; ALL-NEXT: vorps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %res = or <4 x i64> %1, %2 @@ -1599,15 +1237,10 @@ } define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_or_ps: -; X32: # BB#0: -; X32-NEXT: vorps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_or_ps: -; X64: # BB#0: -; X64-NEXT: vorps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_or_ps: +; ALL: # BB#0: +; ALL-NEXT: vorps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %1 = bitcast <8 x float> %a0 to <8 x i32> %2 = bitcast <8 x float> %a1 to <8 x i32> %res = or <8 x i32> %1, %2 @@ -1616,85 +1249,55 @@ } define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind { -; X32-LABEL: test_mm_permute_pd: -; X32: # BB#0: -; X32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_permute_pd: -; X64: # BB#0: -; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; X64-NEXT: retq +; ALL-LABEL: test_mm_permute_pd: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> ret <2 x double> %res } define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_permute_pd: -; X32: # BB#0: -; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_permute_pd: -; X64: # BB#0: -; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_permute_pd: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> ret <4 x double> %res } define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind { -; X32-LABEL: test_mm_permute_ps: -; X32: # BB#0: -; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_permute_ps: -; X64: # BB#0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; X64-NEXT: retq +; ALL-LABEL: test_mm_permute_ps: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> ret <4 x float> %res } define <4 x float> 
@test2_mm_permute_ps(<4 x float> %a0) nounwind { -; X32-LABEL: test2_mm_permute_ps: -; X32: # BB#0: -; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3] -; X32-NEXT: retl -; -; X64-LABEL: test2_mm_permute_ps: -; X64: # BB#0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3] -; X64-NEXT: retq +; ALL-LABEL: test2_mm_permute_ps: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> ret <4 x float> %res } define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_permute_ps: -; X32: # BB#0: -; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_permute_ps: -; X64: # BB#0: -; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_permute_ps: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> ret <8 x float> %res } define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_permute2f128_pd: -; X32: # BB#0: -; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_permute2f128_pd: -; X64: # BB#0: -; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_permute2f128_pd: +; ALL: # BB#0: +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> ret <4 x double> %res } @@ -1702,30 +1305,20 @@ ; PR26667 define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_permute2f128_ps: -; X32: # BB#0: -; X32-NEXT: vmovaps %ymm1, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_permute2f128_ps: -; X64: # BB#0: -; X64-NEXT: vmovaps %ymm1, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_permute2f128_ps: +; ALL: # BB#0: +; ALL-NEXT: vmovaps %ymm1, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> ret <8 x float> %res } declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { -; X32-LABEL: test_mm256_permute2f128_si256: -; X32: # BB#0: -; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_permute2f128_si256: -; X64: # BB#0: -; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_permute2f128_si256: +; ALL: # BB#0: +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; ALL-NEXT: ret{{[l|q]}} %1 = bitcast <4 x i64> %a0 to <8 x i32> %2 = bitcast <4 x i64> %a1 to <8 x i32> %res = shufflevector <8 x i32> %2, <8 x i32> %2, <8 x i32> @@ -1735,45 +1328,30 @@ declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind { -; X32-LABEL: test_mm_permutevar_pd: -; X32: # BB#0: -; X32-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_permutevar_pd: -; X64: # BB#0: -; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm_permutevar_pd: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: ret{{[l|q]}} %res = 
call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) ret <2 x double> %res } declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind { -; X32-LABEL: test_mm256_permutevar_pd: -; X32: # BB#0: -; X32-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_permutevar_pd: -; X64: # BB#0: -; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_permutevar_pd: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind { -; X32-LABEL: test_mm_permutevar_ps: -; X32: # BB#0: -; X32-NEXT: vpermilps %xmm1, %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_permutevar_ps: -; X64: # BB#0: -; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm_permutevar_ps: +; ALL: # BB#0: +; ALL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 +; ALL-NEXT: ret{{[l|q]}} %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1) ret <4 x float> %res @@ -1781,15 +1359,10 @@ declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind { -; X32-LABEL: test_mm256_permutevar_ps: -; X32: # BB#0: -; X32-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_permutevar_ps: -; X64: # BB#0: -; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_permutevar_ps: +; ALL: # BB#0: +; ALL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %arg1 = bitcast <4 x i64> %a1 to <8 x i32> %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1) ret <8 x float> %res @@ -1797,58 +1370,38 @@ declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_rcp_ps: -; X32: # BB#0: -; X32-NEXT: vrcpps %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_rcp_ps: -; X64: # BB#0: -; X64-NEXT: vrcpps %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_rcp_ps: +; ALL: # BB#0: +; ALL-NEXT: vrcpps %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ret <8 x float> %res } declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_round_pd: -; X32: # BB#0: -; X32-NEXT: vroundpd $4, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_round_pd: -; X64: # BB#0: -; X64-NEXT: vroundpd $4, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_round_pd: +; ALL: # BB#0: +; ALL-NEXT: vroundpd $4, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4) ret <4 x double> %res } define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_round_ps: -; X32: # BB#0: -; X32-NEXT: vroundps $4, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_round_ps: -; X64: # 
BB#0: -; X64-NEXT: vroundps $4, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_round_ps: +; ALL: # BB#0: +; ALL-NEXT: vroundps $4, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4) ret <8 x float> %res } define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_rsqrt_ps: -; X32: # BB#0: -; X32-NEXT: vrsqrtps %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_rsqrt_ps: -; X64: # BB#0: -; X64-NEXT: vrsqrtps %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_rsqrt_ps: +; ALL: # BB#0: +; ALL-NEXT: vrsqrtps %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ret <8 x float> %res } @@ -1922,7 +1475,7 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set_epi8: ; X64: # BB#0: @@ -1991,7 +1544,7 @@ ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <32 x i8> undef, i8 %a31, i32 0 %res1 = insertelement <32 x i8> %res0, i8 %a30, i32 1 %res2 = insertelement <32 x i8> %res1, i8 %a29, i32 2 @@ -2064,7 +1617,7 @@ ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set_epi16: ; X64: # BB#0: @@ -2095,7 +1648,7 @@ ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <16 x i16> undef, i16 %a15, i32 0 %res1 = insertelement <16 x i16> %res0, i16 %a14, i32 1 %res2 = insertelement <16 x i16> %res1, i16 %a13, i32 2 @@ -2128,7 +1681,7 @@ ; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set_epi32: ; X64: # BB#0: @@ -2141,7 +1694,7 @@ ; X64-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1 ; X64-NEXT: vpinsrd $3, %r8d, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0 %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1 %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2 @@ -2166,7 +1719,7 @@ ; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set_epi64x: ; X64: # BB#0: @@ -2177,7 +1730,7 @@ ; X64-NEXT: vmovq %rcx, %xmm2 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0 %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1 %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2 @@ -2186,33 +1739,21 @@ } define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm256_set_m128: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: 
test_mm256_set_m128: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_set_m128: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> ret <8 x float> %res } define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind { -; X32-LABEL: test_mm256_set_m128d: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_set_m128d: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_set_m128d: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x double> %a0 to <4 x float> %arg1 = bitcast <2 x double> %a1 to <4 x float> %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> @@ -2221,17 +1762,11 @@ } define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind { -; X32-LABEL: test_mm256_set_m128i: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_set_m128i: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_set_m128i: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x float> %arg1 = bitcast <2 x i64> %a1 to <4 x float> %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> @@ -2249,14 +1784,14 @@ ; X32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set_pd: ; X64: # BB#0: ; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <4 x double> undef, double %a3, i32 0 %res1 = insertelement <4 x double> %res0, double %a2, i32 1 %res2 = insertelement <4 x double> %res1, double %a1, i32 2 @@ -2282,7 +1817,7 @@ ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set_ps: ; X64: # BB#0: @@ -2293,7 +1828,7 @@ ; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] ; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <8 x float> undef, float %a7, i32 0 %res1 = insertelement <8 x float> %res0, float %a6, i32 1 %res2 = insertelement <8 x float> %res1, float %a5, i32 2 @@ -2313,7 +1848,7 @@ ; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set1_epi8: ; X64: # BB#0: @@ -2322,7 +1857,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpshufb 
%xmm1, %xmm0, %xmm0 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <32 x i8> undef, i8 %a0, i32 0 %res1 = insertelement <32 x i8> %res0, i8 %a0, i32 1 %res2 = insertelement <32 x i8> %res1, i8 %a0, i32 2 @@ -2367,7 +1902,7 @@ ; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set1_epi16: ; X64: # BB#0: @@ -2375,7 +1910,7 @@ ; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <16 x i16> undef, i16 %a0, i32 0 %res1 = insertelement <16 x i16> %res0, i16 %a0, i32 1 %res2 = insertelement <16 x i16> %res1, i16 %a0, i32 2 @@ -2402,14 +1937,14 @@ ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set1_epi32: ; X64: # BB#0: ; X64-NEXT: vmovd %edi, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0 %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1 %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2 @@ -2432,14 +1967,14 @@ ; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set1_epi64x: ; X64: # BB#0: ; X64-NEXT: vmovq %rdi, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0 %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1 %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2 @@ -2453,13 +1988,13 @@ ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set1_pd: ; X64: # BB#0: ; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <4 x double> undef, double %a0, i32 0 %res1 = insertelement <4 x double> %res0, double %a0, i32 1 %res2 = insertelement <4 x double> %res1, double %a0, i32 2 @@ -2473,13 +2008,13 @@ ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_set1_ps: ; X64: # BB#0: ; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <8 x float> undef, float %a0, i32 0 %res1 = insertelement <8 x float> %res0, float %a0, i32 1 %res2 = insertelement <8 x float> %res1, float %a0, i32 2 @@ -2559,7 +2094,7 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_setr_epi8: ; X64: # 
BB#0: @@ -2628,7 +2163,7 @@ ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <32 x i8> undef, i8 %a0 , i32 0 %res1 = insertelement <32 x i8> %res0, i8 %a1 , i32 1 %res2 = insertelement <32 x i8> %res1, i8 %a2 , i32 2 @@ -2701,7 +2236,7 @@ ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_setr_epi16: ; X64: # BB#0: @@ -2732,7 +2267,7 @@ ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <16 x i16> undef, i16 %a0 , i32 0 %res1 = insertelement <16 x i16> %res0, i16 %a1 , i32 1 %res2 = insertelement <16 x i16> %res1, i16 %a2 , i32 2 @@ -2765,7 +2300,7 @@ ; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_setr_epi32: ; X64: # BB#0: @@ -2778,7 +2313,7 @@ ; X64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ; X64-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0 %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1 %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2 @@ -2803,7 +2338,7 @@ ; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_setr_epi64x: ; X64: # BB#0: @@ -2814,7 +2349,7 @@ ; X64-NEXT: vmovq %rdi, %xmm2 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0 %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1 %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2 @@ -2823,33 +2358,21 @@ } define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm256_setr_m128: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_setr_m128: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_setr_m128: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> ret <8 x float> %res } define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind { -; X32-LABEL: test_mm256_setr_m128d: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_setr_m128d: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_setr_m128d: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x 
double> %a0 to <4 x float> %arg1 = bitcast <2 x double> %a1 to <4 x float> %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> @@ -2858,17 +2381,11 @@ } define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind { -; X32-LABEL: test_mm256_setr_m128i: -; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_setr_m128i: -; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_setr_m128i: +; ALL: # BB#0: +; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x float> %arg1 = bitcast <2 x i64> %a1 to <4 x float> %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> @@ -2886,14 +2403,14 @@ ; X32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X32-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_setr_pd: ; X64: # BB#0: ; X64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <4 x double> undef, double %a0, i32 0 %res1 = insertelement <4 x double> %res0, double %a1, i32 1 %res2 = insertelement <4 x double> %res1, double %a2, i32 2 @@ -2919,7 +2436,7 @@ ; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] ; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_setr_ps: ; X64: # BB#0: @@ -2930,7 +2447,7 @@ ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; X64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %res0 = insertelement <8 x float> undef, float %a0, i32 0 %res1 = insertelement <8 x float> %res0, float %a1, i32 1 %res2 = insertelement <8 x float> %res1, float %a2, i32 2 @@ -2943,97 +2460,62 @@ } define <4 x double> @test_mm256_setzero_pd() nounwind { -; X32-LABEL: test_mm256_setzero_pd: -; X32: # BB#0: -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_setzero_pd: -; X64: # BB#0: -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_setzero_pd: +; ALL: # BB#0: +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ALL-NEXT: ret{{[l|q]}} ret <4 x double> zeroinitializer } define <8 x float> @test_mm256_setzero_ps() nounwind { -; X32-LABEL: test_mm256_setzero_ps: -; X32: # BB#0: -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_setzero_ps: -; X64: # BB#0: -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_setzero_ps: +; ALL: # BB#0: +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ALL-NEXT: ret{{[l|q]}} ret <8 x float> zeroinitializer } define <4 x i64> @test_mm256_setzero_si256() nounwind { -; X32-LABEL: test_mm256_setzero_si256: -; X32: # BB#0: -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_setzero_si256: -; X64: # BB#0: -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_setzero_si256: +; ALL: # BB#0: +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ALL-NEXT: ret{{[l|q]}} ret <4 
x i64> zeroinitializer } define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_shuffle_pd: -; X32: # BB#0: -; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_shuffle_pd: -; X64: # BB#0: -; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_shuffle_pd: +; ALL: # BB#0: +; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> ret <4 x double> %res } define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_shuffle_ps: -; X32: # BB#0: -; X32-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_shuffle_ps: -; X64: # BB#0: -; X64-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_shuffle_ps: +; ALL: # BB#0: +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> ret <8 x float> %res } define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind { -; X32-LABEL: test_mm256_sqrt_pd: -; X32: # BB#0: -; X32-NEXT: vsqrtpd %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_sqrt_pd: -; X64: # BB#0: -; X64-NEXT: vsqrtpd %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_sqrt_pd: +; ALL: # BB#0: +; ALL-NEXT: vsqrtpd %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_sqrt_ps: -; X32: # BB#0: -; X32-NEXT: vsqrtps %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_sqrt_ps: -; X64: # BB#0: -; X64-NEXT: vsqrtps %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_sqrt_ps: +; ALL: # BB#0: +; ALL-NEXT: vsqrtps %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ret <8 x float> %res } @@ -3045,13 +2527,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovaps %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_store_pd: ; X64: # BB#0: ; X64-NEXT: vmovaps %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to <4 x double>* store <4 x double> %a1, <4 x double>* %arg0, align 32 ret void @@ -3063,13 +2545,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovaps %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_store_ps: ; X64: # BB#0: ; X64-NEXT: vmovaps %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to <8 x float>* store <8 x float> %a1, <8 x float>* %arg0, align 32 ret void @@ -3081,13 +2563,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovaps %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_store_si256: ; X64: # BB#0: ; X64-NEXT: vmovaps %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} store <4 x i64> %a1, <4 x i64>* %a0, align 32 ret void } @@ -3098,13 +2580,13 @@ ; 
X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_storeu_pd: ; X64: # BB#0: ; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to <4 x double>* store <4 x double> %a1, <4 x double>* %arg0, align 1 ret void @@ -3116,13 +2598,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_storeu_ps: ; X64: # BB#0: ; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to <8 x float>* store <8 x float> %a1, <8 x float>* %arg0, align 1 ret void @@ -3134,13 +2616,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_storeu_si256: ; X64: # BB#0: ; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} store <4 x i64> %a1, <4 x i64>* %a0, align 1 ret void } @@ -3154,7 +2636,7 @@ ; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_storeu2_m128: ; X64: # BB#0: @@ -3162,7 +2644,7 @@ ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X64-NEXT: vmovups %xmm0, (%rsi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to <4 x float>* %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> store <4 x float> %lo, <4 x float>* %arg0, align 1 @@ -3181,7 +2663,7 @@ ; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_storeu2_m128d: ; X64: # BB#0: @@ -3189,7 +2671,7 @@ ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X64-NEXT: vmovups %xmm0, (%rsi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to <2 x double>* %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> store <2 x double> %lo, <2 x double>* %arg0, align 1 @@ -3208,7 +2690,7 @@ ; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_storeu2_m128i: ; X64: # BB#0: @@ -3216,7 +2698,7 @@ ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X64-NEXT: vmovups %xmm0, (%rsi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64>* %a0 to <2 x i64>* %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> store <2 x i64> %lo, <2 x i64>* %arg0, align 1 @@ -3232,13 +2714,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovntps %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_stream_pd: ; X64: # BB#0: ; X64-NEXT: vmovntps %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast double* %a0 to <4 x double>* store <4 x double> %a1, <4 x double>* %arg0, align 32, !nontemporal !0 ret void @@ -3250,13 +2732,13 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovntps %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_stream_ps: ; X64: # BB#0: ; X64-NEXT: vmovntps %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; 
X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast float* %a0 to <8 x float>* store <8 x float> %a1, <8 x float>* %arg0, align 32, !nontemporal !0 ret void @@ -3268,458 +2750,293 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovntps %ymm0, (%eax) ; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: ret{{[l|q]}} ; ; X64-LABEL: test_mm256_stream_si256: ; X64: # BB#0: ; X64-NEXT: vmovntps %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: ret{{[l|q]}} store <4 x i64> %a1, <4 x i64>* %a0, align 32, !nontemporal !0 ret void } define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_sub_pd: -; X32: # BB#0: -; X32-NEXT: vsubpd %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_sub_pd: -; X64: # BB#0: -; X64-NEXT: vsubpd %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_sub_pd: +; ALL: # BB#0: +; ALL-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = fsub <4 x double> %a0, %a1 ret <4 x double> %res } define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_sub_ps: -; X32: # BB#0: -; X32-NEXT: vsubps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_sub_ps: -; X64: # BB#0: -; X64-NEXT: vsubps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_sub_ps: +; ALL: # BB#0: +; ALL-NEXT: vsubps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %res = fsub <8 x float> %a0, %a1 ret <8 x float> %res } define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind { -; X32-LABEL: test_mm_testc_pd: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestpd %xmm1, %xmm0 -; X32-NEXT: setb %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_testc_pd: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestpd %xmm1, %xmm0 -; X64-NEXT: setb %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_testc_pd: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestpd %xmm1, %xmm0 +; ALL-NEXT: setb %al +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_testc_pd: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestpd %ymm1, %ymm0 -; X32-NEXT: setb %al -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_testc_pd: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestpd %ymm1, %ymm0 -; X64-NEXT: setb %al -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_testc_pd: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestpd %ymm1, %ymm0 +; ALL-NEXT: setb %al +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_testc_ps: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestps %xmm1, %xmm0 -; X32-NEXT: setb %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_testc_ps: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestps %xmm1, %xmm0 -; X64-NEXT: setb %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_testc_ps: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestps %xmm1, %xmm0 +; ALL-NEXT: setb %al +; ALL-NEXT: 
ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_testc_ps: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestps %ymm1, %ymm0 -; X32-NEXT: setb %al -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_testc_ps: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestps %ymm1, %ymm0 -; X64-NEXT: setb %al -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_testc_ps: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestps %ymm1, %ymm0 +; ALL-NEXT: setb %al +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { -; X32-LABEL: test_mm256_testc_si256: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vptest %ymm1, %ymm0 -; X32-NEXT: setb %al -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_testc_si256: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vptest %ymm1, %ymm0 -; X64-NEXT: setb %al -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_testc_si256: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vptest %ymm1, %ymm0 +; ALL-NEXT: setb %al +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) ret i32 %res } declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind { -; X32-LABEL: test_mm_testnzc_pd: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestpd %xmm1, %xmm0 -; X32-NEXT: seta %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_testnzc_pd: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestpd %xmm1, %xmm0 -; X64-NEXT: seta %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_testnzc_pd: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestpd %xmm1, %xmm0 +; ALL-NEXT: seta %al +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_testnzc_pd: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestpd %ymm1, %ymm0 -; X32-NEXT: seta %al -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_testnzc_pd: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestpd %ymm1, %ymm0 -; X64-NEXT: seta %al -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_testnzc_pd: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestpd %ymm1, %ymm0 +; ALL-NEXT: seta %al +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_testnzc_ps: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestps %xmm1, %xmm0 -; X32-NEXT: seta %al -; X32-NEXT: retl -; -; 
X64-LABEL: test_mm_testnzc_ps: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestps %xmm1, %xmm0 -; X64-NEXT: seta %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_testnzc_ps: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestps %xmm1, %xmm0 +; ALL-NEXT: seta %al +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_testnzc_ps: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestps %ymm1, %ymm0 -; X32-NEXT: seta %al -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_testnzc_ps: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestps %ymm1, %ymm0 -; X64-NEXT: seta %al -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_testnzc_ps: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestps %ymm1, %ymm0 +; ALL-NEXT: seta %al +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { -; X32-LABEL: test_mm256_testnzc_si256: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vptest %ymm1, %ymm0 -; X32-NEXT: seta %al -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_testnzc_si256: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vptest %ymm1, %ymm0 -; X64-NEXT: seta %al -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_testnzc_si256: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vptest %ymm1, %ymm0 +; ALL-NEXT: seta %al +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1) ret i32 %res } declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind { -; X32-LABEL: test_mm_testz_pd: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestpd %xmm1, %xmm0 -; X32-NEXT: sete %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_testz_pd: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestpd %xmm1, %xmm0 -; X64-NEXT: sete %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_testz_pd: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestpd %xmm1, %xmm0 +; ALL-NEXT: sete %al +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_testz_pd: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestpd %ymm1, %ymm0 -; X32-NEXT: sete %al -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_testz_pd: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestpd %ymm1, %ymm0 -; X64-NEXT: sete %al -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_testz_pd: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestpd %ymm1, %ymm0 +; ALL-NEXT: sete %al +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1) ret i32 %res } declare i32 
@llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_testz_ps: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestps %xmm1, %xmm0 -; X32-NEXT: sete %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_testz_ps: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestps %xmm1, %xmm0 -; X64-NEXT: sete %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_testz_ps: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestps %xmm1, %xmm0 +; ALL-NEXT: sete %al +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_testz_ps: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vtestps %ymm1, %ymm0 -; X32-NEXT: sete %al -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_testz_ps: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vtestps %ymm1, %ymm0 -; X64-NEXT: sete %al -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_testz_ps: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vtestps %ymm1, %ymm0 +; ALL-NEXT: sete %al +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1) ret i32 %res } declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { -; X32-LABEL: test_mm256_testz_si256: -; X32: # BB#0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: vptest %ymm1, %ymm0 -; X32-NEXT: sete %al -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_testz_si256: -; X64: # BB#0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: vptest %ymm1, %ymm0 -; X64-NEXT: sete %al -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; ALL-LABEL: test_mm256_testz_si256: +; ALL: # BB#0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: vptest %ymm1, %ymm0 +; ALL-NEXT: sete %al +; ALL-NEXT: vzeroupper +; ALL-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1) ret i32 %res } declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone define <2 x double> @test_mm_undefined_pd() nounwind { -; X32-LABEL: test_mm_undefined_pd: -; X32: # BB#0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm_undefined_pd: -; X64: # BB#0: -; X64-NEXT: retq +; ALL-LABEL: test_mm_undefined_pd: +; ALL: # BB#0: +; ALL-NEXT: ret{{[l|q]}} ret <2 x double> undef } define <4 x double> @test_mm256_undefined_pd() nounwind { -; X32-LABEL: test_mm256_undefined_pd: -; X32: # BB#0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_undefined_pd: -; X64: # BB#0: -; X64-NEXT: retq +; ALL-LABEL: test_mm256_undefined_pd: +; ALL: # BB#0: +; ALL-NEXT: ret{{[l|q]}} ret <4 x double> undef } define <8 x float> @test_mm256_undefined_ps() nounwind { -; X32-LABEL: test_mm256_undefined_ps: -; X32: # BB#0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_undefined_ps: -; X64: # BB#0: -; X64-NEXT: retq +; ALL-LABEL: test_mm256_undefined_ps: +; ALL: # BB#0: +; ALL-NEXT: ret{{[l|q]}} ret <8 x float> undef } define <4 x i64> @test_mm256_undefined_si256() nounwind { -; X32-LABEL: test_mm256_undefined_si256: -; X32: # BB#0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_undefined_si256: -; X64: # BB#0: -; X64-NEXT: retq +; ALL-LABEL: test_mm256_undefined_si256: +; 
ALL: # BB#0: +; ALL-NEXT: ret{{[l|q]}} ret <4 x i64> undef } define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_unpackhi_pd: -; X32: # BB#0: -; X32-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_unpackhi_pd: -; X64: # BB#0: -; X64-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_unpackhi_pd: +; ALL: # BB#0: +; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> ret <4 x double> %res } define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_unpackhi_ps: -; X32: # BB#0: -; X32-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_unpackhi_ps: -; X64: # BB#0: -; X64-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_unpackhi_ps: +; ALL: # BB#0: +; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> ret <8 x float> %res } define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_unpacklo_pd: -; X32: # BB#0: -; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_unpacklo_pd: -; X64: # BB#0: -; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_unpacklo_pd: +; ALL: # BB#0: +; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> ret <4 x double> %res } define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_unpacklo_ps: -; X32: # BB#0: -; X32-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_unpacklo_ps: -; X64: # BB#0: -; X64-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; X64-NEXT: retq +; ALL-LABEL: test_mm256_unpacklo_ps: +; ALL: # BB#0: +; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> ret <8 x float> %res } define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind { -; X32-LABEL: test_mm256_xor_pd: -; X32: # BB#0: -; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_xor_pd: -; X64: # BB#0: -; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm256_xor_pd: +; ALL: # BB#0: +; ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: ret{{[l|q]}} %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %res = xor <4 x i64> %1, %2 @@ -3728,15 +3045,10 @@ } define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind { -; X32-LABEL: test_mm256_xor_ps: -; X32: # BB#0: -; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_xor_ps: -; X64: # BB#0: -; X64-NEXT: vxorps 
Index: test/CodeGen/X86/broadcastm-lowering.ll
===================================================================
--- test/CodeGen/X86/broadcastm-lowering.ll
+++ test/CodeGen/X86/broadcastm-lowering.ll
@@ -43,29 +43,21 @@
 define <4 x i32> @test_mm_epi32(<16 x i8> %a, <16 x i8> %b) {
 ; AVX512CD-LABEL: test_mm_epi32:
 ; AVX512CD: # BB#0: # %entry
-; AVX512CD-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX512CD-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512CD-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512CD-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512CD-NEXT: kmovw %k0, %eax
-; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512CD-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX512CD-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX512CD-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX512CD-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX512CD-NEXT: vzeroupper
+; AVX512CD-NEXT: vpmovmskb %xmm0, %eax
+; AVX512CD-NEXT: vmovd %eax, %xmm0
+; AVX512CD-NEXT: vpbroadcastd %xmm0, %xmm0
 ; AVX512CD-NEXT: retq
 ;
 ; AVX512VLCDBW-LABEL: test_mm_epi32:
 ; AVX512VLCDBW: # BB#0: # %entry
-; AVX512VLCDBW-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
-; AVX512VLCDBW-NEXT: vpbroadcastmw2d %k0, %xmm0
+; AVX512VLCDBW-NEXT: vpmovmskb %xmm0, %eax
+; AVX512VLCDBW-NEXT: vpbroadcastd %eax, %xmm0
 ; AVX512VLCDBW-NEXT: retq
 ;
 ; X86-AVX512VLCDBW-LABEL: test_mm_epi32:
 ; X86-AVX512VLCDBW: # BB#0: # %entry
-; X86-AVX512VLCDBW-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
-; X86-AVX512VLCDBW-NEXT: vpbroadcastmw2d %k0, %xmm0
+; X86-AVX512VLCDBW-NEXT: vpmovmskb %xmm0, %eax
+; X86-AVX512VLCDBW-NEXT: vpbroadcastd %eax, %xmm0
 ; X86-AVX512VLCDBW-NEXT: retl
 entry:
   %0 = icmp eq <16 x i8> %a, %b
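With the pmovmskb intrinsic upgraded to generic IR, the compare mask reaches the broadcast as an ordinary i32 in a GPR, so the k-register sequence (compare into %k0 plus vpbroadcastmw2d) gives way to vpmovmskb followed by vpbroadcastd. The remainder of the test body lies outside the hunk; a plausible shape for it, offered only as a hedged sketch (value names hypothetical), packs the <16 x i1> mask and splats it into every lane:

  %1 = bitcast <16 x i1> %0 to i16                                          ; pack the compare mask
  %2 = zext i16 %1 to i32                                                   ; widen to the element type
  %3 = insertelement <4 x i32> undef, i32 %2, i32 0                         ; scalar into lane 0
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer ; splat lane 0
  ret <4 x i32> %4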
Index: test/CodeGen/X86/movmsk.ll
===================================================================
--- test/CodeGen/X86/movmsk.ll
+++ test/CodeGen/X86/movmsk.ll
@@ -133,7 +133,7 @@
 define i32 @t2(<4 x float> %x, i32* nocapture %indexTable) nounwind uwtable readonly ssp {
 ; CHECK-LABEL: t2:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: movmskpd %xmm0, %eax
+; CHECK-NEXT: movmskps %xmm0, %eax
 ; CHECK-NEXT: movl (%rdi,%rax,4), %eax
 ; CHECK-NEXT: retq
 entry:
Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -1826,11 +1826,13 @@
 ; X32-LABEL: test_mm_movemask_epi8:
 ; X32: # BB#0:
 ; X32-NEXT: pmovmskb %xmm0, %eax
+; X32-NEXT: movzwl %ax, %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_movemask_epi8:
 ; X64: # BB#0:
 ; X64-NEXT: pmovmskb %xmm0, %eax
+; X64-NEXT: movzwl %ax, %eax
 ; X64-NEXT: retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %arg0)
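The added movzwl falls straight out of the upgraded IR for llvm.x86.sse2.pmovmskb.128: the 16 lane sign bits are produced as an i16 and only then zero-extended to i32, exactly the shape the InstCombine expectations below check for. A self-contained sketch of that form (hypothetical function name):

define i32 @pmovmskb_shape(<16 x i8> %v) {
  %m = icmp slt <16 x i8> %v, zeroinitializer ; sign bit of each byte lane
  %b = bitcast <16 x i1> %m to i16            ; pack the 16 bits into an i16
  %r = zext i16 %b to i32                     ; the zext that becomes movzwl
  ret i32 %r
}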
Index: test/Transforms/InstCombine/X86/x86-movmsk.ll
===================================================================
--- test/Transforms/InstCombine/X86/x86-movmsk.ll
+++ test/Transforms/InstCombine/X86/x86-movmsk.ll
@@ -9,7 +9,7 @@
 define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
 ; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[A0:%.*]])
 ; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
@@ -19,8 +19,11 @@
 define i32 @test_upper_x86_sse_movmsk_ps(<4 x float> %a0) {
 ; CHECK-LABEL: @test_upper_x86_sse_movmsk_ps(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A0:%.*]] to <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT: [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
 ;
   %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
   %2 = and i32 %1, 15
@@ -29,8 +32,11 @@
 define i32 @test_upper_x86_sse2_movmsk_pd(<2 x double> %a0) {
 ; CHECK-LABEL: @test_upper_x86_sse2_movmsk_pd(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[A0:%.*]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i1> [[TMP2]] to i2
+; CHECK-NEXT: [[TMP4:%.*]] = zext i2 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
 ;
   %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
   %2 = and i32 %1, 3
@@ -39,8 +45,10 @@
 define i32 @test_upper_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
 ; CHECK-LABEL: @test_upper_x86_sse2_pmovmskb_128(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <16 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i1> [[TMP1]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT: ret i32 [[TMP3]]
 ;
   %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
   %2 = and i32 %1, 65535
@@ -49,8 +57,11 @@
 define i32 @test_upper_x86_avx_movmsk_ps_256(<8 x float> %a0) {
 ; CHECK-LABEL: @test_upper_x86_avx_movmsk_ps_256(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[A0:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i1> [[TMP2]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
 ;
   %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
   %2 = and i32 %1, 255
@@ -59,8 +70,11 @@
 define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
 ; CHECK-LABEL: @test_upper_x86_avx_movmsk_pd_256(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[A0:%.*]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT: [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[TMP4]]
 ;
   %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
   %2 = and i32 %1, 15