Index: test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll =================================================================== --- test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -101,12 +101,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_x86_avx_vbroadcastf128_pd_256: ; X64: # %bb.0: ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -118,12 +118,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_x86_avx_vbroadcastf128_ps_256: ; X64: # %bb.0: ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -402,14 +402,14 @@ ; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovdqu %xmm0, (%eax) -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_x86_sse2_storeu_dq: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovdqu %xmm0, (%rdi) -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %a2 = add <16 x i8> %a1, call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2) ret void @@ -426,7 +426,7 @@ ; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovupd %xmm0, (%eax) -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_x86_sse2_storeu_pd: ; X64: # %bb.0: @@ -434,7 +434,7 @@ ; X64-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovupd %xmm0, (%rdi) -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %a2 = fadd <2 x double> %a1, call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2) ret void @@ -447,12 +447,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovups %xmm0, (%eax) -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_x86_sse_storeu_ps: ; X64: # %bb.0: ; X64-NEXT: vmovups %xmm0, (%rdi) -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1) ret void } @@ -472,7 +472,7 @@ ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: vmovups %ymm0, (%eax) ; X86-NEXT: vzeroupper -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_x86_avx_storeu_dq_256: ; X64: # %bb.0: @@ -483,7 +483,7 @@ ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %a2 = add <32 x i8> %a1, call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2) ret void @@ -500,7 +500,7 @@ ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vmovupd %ymm0, (%eax) ; X86-NEXT: vzeroupper -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_x86_avx_storeu_pd_256: ; X64: # %bb.0: @@ -508,7 +508,7 @@ ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vmovupd %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %a2 = fadd <4 x double> %a1, call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2) ret void @@ -522,13 +522,13 @@ ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovups %ymm0, (%eax) ; X86-NEXT: vzeroupper -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_x86_avx_storeu_ps_256: ; X64: # %bb.0: ; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1) ret void } Index: test/CodeGen/X86/avx-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/avx-intrinsics-x86.ll +++ test/CodeGen/X86/avx-intrinsics-x86.ll @@ -287,12 +287,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: vlddqu (%eax), %ymm0 # encoding: [0xc5,0xff,0xf0,0x00] -; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx_ldu_dq_256: ; X64: # %bb.0: ; X64-NEXT: vlddqu (%rdi), %ymm0 # encoding: [0xc5,0xff,0xf0,0x07] -; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -304,12 +304,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x2d,0x00] -; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx_maskload_pd: ; X64: # %bb.0: ; X64-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x2d,0x07] -; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %mask) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -321,12 +321,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x2d,0x00] -; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx_maskload_pd_256: ; X64: # %bb.0: ; X64-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x2d,0x07] -; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -338,12 +338,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: vmaskmovps (%eax), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x2c,0x00] -; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx_maskload_ps: ; X64: # %bb.0: ; X64-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x2c,0x07] -; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -355,12 +355,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: vmaskmovps (%eax), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x2c,0x00] -; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx_maskload_ps_256: ; X64: # %bb.0: ; X64-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x2c,0x07] -; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> 
@llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -372,12 +372,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax) # encoding: [0xc4,0xe2,0x79,0x2f,0x08] -; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx_maskstore_pd: ; X64: # %bb.0: ; X64-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # encoding: [0xc4,0xe2,0x79,0x2f,0x0f] -; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-NEXT: retq # encoding: [0xc3] call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %mask, <2 x double> %a2) ret void } @@ -390,13 +390,13 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) # encoding: [0xc4,0xe2,0x7d,0x2f,0x08] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx_maskstore_pd_256: ; X64: # %bb.0: ; X64-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # encoding: [0xc4,0xe2,0x7d,0x2f,0x0f] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-NEXT: retq # encoding: [0xc3] call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) ret void } @@ -408,12 +408,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: vmaskmovps %xmm1, %xmm0, (%eax) # encoding: [0xc4,0xe2,0x79,0x2e,0x08] -; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx_maskstore_ps: ; X64: # %bb.0: ; X64-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # encoding: [0xc4,0xe2,0x79,0x2e,0x0f] -; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-NEXT: retq # encoding: [0xc3] call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %mask, <4 x float> %a2) ret void } @@ -426,13 +426,13 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) # encoding: [0xc4,0xe2,0x7d,0x2e,0x08] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx_maskstore_ps_256: ; X64: # %bb.0: ; X64-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # encoding: [0xc4,0xe2,0x7d,0x2e,0x0f] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-NEXT: retq # encoding: [0xc3] call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) ret void } @@ -720,23 +720,23 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX-NEXT: vpermilps (%eax), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0c,0x00] -; X86-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_load: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX512VL-NEXT: vpermilps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0x00] -; X86-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx_vpermilvar_ps_load: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0c,0x07] -; X64-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; 
X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_load: ; X64-AVX512VL: # %bb.0: ; X64-AVX512VL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0x07] -; X64-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %a2 = load <4 x i32>, <4 x i32>* %a1 %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a2) ; <<4 x float>> [#uses=1] ret <4 x float> %res @@ -951,7 +951,7 @@ ; X86-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfb,0xc1] ; X86-AVX-NEXT: vmovntdq %ymm0, (%eax) # encoding: [0xc5,0xfd,0xe7,0x00] ; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X86-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: movnt_dq: ; X86-AVX512VL: # %bb.0: @@ -960,7 +960,7 @@ ; X86-AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1] ; X86-AVX512VL-NEXT: vmovntdq %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00] ; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X86-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: movnt_dq: ; X64-AVX: # %bb.0: @@ -968,7 +968,7 @@ ; X64-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfb,0xc1] ; X64-AVX-NEXT: vmovntdq %ymm0, (%rdi) # encoding: [0xc5,0xfd,0xe7,0x07] ; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X64-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: movnt_dq: ; X64-AVX512VL: # %bb.0: @@ -976,7 +976,7 @@ ; X64-AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1] ; X64-AVX512VL-NEXT: vmovntdq %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x07] ; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X64-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %a2 = add <2 x i64> %a1, %a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a3) nounwind @@ -990,26 +990,26 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX-NEXT: vmovntps %ymm0, (%eax) # encoding: [0xc5,0xfc,0x2b,0x00] ; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X86-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: movnt_ps: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX512VL-NEXT: vmovntps %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x00] ; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X86-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: movnt_ps: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovntps %ymm0, (%rdi) # encoding: [0xc5,0xfc,0x2b,0x07] ; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X64-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: movnt_ps: ; X64-AVX512VL: # %bb.0: ; X64-AVX512VL-NEXT: vmovntps %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x07] ; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X64-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] 
tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind ret void } @@ -1024,7 +1024,7 @@ ; X86-AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x58,0xc1] ; X86-AVX-NEXT: vmovntpd %ymm0, (%eax) # encoding: [0xc5,0xfd,0x2b,0x00] ; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X86-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: movnt_pd: ; X86-AVX512VL: # %bb.0: @@ -1033,7 +1033,7 @@ ; X86-AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; X86-AVX512VL-NEXT: vmovntpd %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x00] ; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X86-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: movnt_pd: ; X64-AVX: # %bb.0: @@ -1041,7 +1041,7 @@ ; X64-AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x58,0xc1] ; X64-AVX-NEXT: vmovntpd %ymm0, (%rdi) # encoding: [0xc5,0xfd,0x2b,0x07] ; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X64-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: movnt_pd: ; X64-AVX512VL: # %bb.0: @@ -1049,7 +1049,7 @@ ; X64-AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; X64-AVX512VL-NEXT: vmovntpd %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x07] ; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X64-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %a2 = fadd <4 x double> %a1, tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind ret void Index: test/CodeGen/X86/avx2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -367,12 +367,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_broadcastsi128_si256_mem: ; X64: # %bb.0: ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %a0 = load <2 x i64>, <2 x i64>* %p0 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> ret <4 x i64> %res @@ -766,7 +766,7 @@ ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_i32gather_epi32: ; X64: # %bb.0: @@ -774,7 +774,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1 ; X64-NEXT: vmovdqa %xmm1, %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i32 *%a0 to i8* %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %mask = bitcast <2 x i64> to <4 x i32> @@ -789,12 +789,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_i32gather_epi32: ; X64: # %bb.0: ; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast i32 *%a1 to i8* %arg2 = bitcast <2 x i64> %a2 to <4 x i32> @@ -812,7 +812,7 @@ ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpgatherdd %ymm2, 
(%eax,%ymm0,2), %ymm1 ; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_i32gather_epi32: ; X64: # %bb.0: @@ -820,7 +820,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1 ; X64-NEXT: vmovdqa %ymm1, %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i32 *%a0 to i8* %arg1 = bitcast <4 x i64> %a1 to <8 x i32> %mask = bitcast <4 x i64> to <8 x i32> @@ -835,12 +835,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_i32gather_epi32: ; X64: # %bb.0: ; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %arg1 = bitcast i32 *%a1 to i8* %arg2 = bitcast <4 x i64> %a2 to <8 x i32> @@ -858,7 +858,7 @@ ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_i32gather_epi64: ; X64: # %bb.0: @@ -866,7 +866,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1 ; X64-NEXT: vmovdqa %xmm1, %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i64 *%a0 to i8* %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> , i8 2) @@ -879,12 +879,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_i32gather_epi64: ; X64: # %bb.0: ; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast i64 *%a1 to i8* %arg2 = bitcast <2 x i64> %a2 to <4 x i32> %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2) @@ -899,7 +899,7 @@ ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1 ; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_i32gather_epi64: ; X64: # %bb.0: @@ -907,7 +907,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1 ; X64-NEXT: vmovdqa %ymm1, %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i64 *%a0 to i8* %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> , i8 2) @@ -920,12 +920,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_i32gather_epi64: ; X64: # %bb.0: ; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast i64 *%a1 to i8* %arg2 = bitcast <2 x i64> %a2 to <4 x i32> %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2) @@ -940,7 +940,7 @@ ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1 ; X86-NEXT: vmovapd %xmm1, %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_i32gather_pd: ; X64: # %bb.0: @@ -948,7 +948,7 @@ ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1 ; 
X64-NEXT: vmovapd %xmm1, %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast double *%a0 to i8* %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer @@ -964,12 +964,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_i32gather_pd: ; X64: # %bb.0: ; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast double *%a1 to i8* %arg2 = bitcast <2 x i64> %a2 to <4 x i32> %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2) @@ -984,7 +984,7 @@ ; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 ; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1 ; X86-NEXT: vmovapd %ymm1, %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_i32gather_pd: ; X64: # %bb.0: @@ -992,7 +992,7 @@ ; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 ; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1 ; X64-NEXT: vmovapd %ymm1, %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast double *%a0 to i8* %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0) @@ -1006,12 +1006,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_i32gather_pd: ; X64: # %bb.0: ; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast double *%a1 to i8* %arg2 = bitcast <2 x i64> %a2 to <4 x i32> %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2) @@ -1026,7 +1026,7 @@ ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1 ; X86-NEXT: vmovaps %xmm1, %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_i32gather_ps: ; X64: # %bb.0: @@ -1034,7 +1034,7 @@ ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1 ; X64-NEXT: vmovaps %xmm1, %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast float *%a0 to i8* %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer @@ -1050,12 +1050,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_i32gather_ps: ; X64: # %bb.0: ; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast float *%a1 to i8* %arg2 = bitcast <2 x i64> %a2 to <4 x i32> %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2) @@ -1070,7 +1070,7 @@ ; X86-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2 ; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1 ; X86-NEXT: vmovaps %ymm1, %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_i32gather_ps: ; X64: # %bb.0: @@ -1078,7 +1078,7 @@ ; X64-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2 ; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1 ; X64-NEXT: vmovaps %ymm1, %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast float *%a0 to i8* %arg1 = bitcast <4 x i64> %a1 to <8 x i32> %mask = call <8 x float> 
@llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0) @@ -1092,12 +1092,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_i32gather_ps: ; X64: # %bb.0: ; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast float *%a1 to i8* %arg2 = bitcast <4 x i64> %a2 to <8 x i32> %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2) @@ -1112,7 +1112,7 @@ ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_i64gather_epi32: ; X64: # %bb.0: @@ -1120,7 +1120,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1 ; X64-NEXT: vmovdqa %xmm1, %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i32 *%a0 to i8* %mask = bitcast <2 x i64> to <4 x i32> %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2) @@ -1134,12 +1134,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_i64gather_epi32: ; X64: # %bb.0: ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast i32 *%a1 to i8* %arg3 = bitcast <2 x i64> %a3 to <4 x i32> @@ -1157,7 +1157,7 @@ ; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: vzeroupper -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_i64gather_epi32: ; X64: # %bb.0: @@ -1166,7 +1166,7 @@ ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1 ; X64-NEXT: vmovdqa %xmm1, %xmm0 ; X64-NEXT: vzeroupper -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i32 *%a0 to i8* %mask = bitcast <2 x i64> to <4 x i32> %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2) @@ -1181,13 +1181,13 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ; X86-NEXT: vzeroupper -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_i64gather_epi32: ; X64: # %bb.0: ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 ; X64-NEXT: vzeroupper -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast i32 *%a1 to i8* %arg3 = bitcast <2 x i64> %a3 to <4 x i32> @@ -1204,7 +1204,7 @@ ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_i64gather_epi64: ; X64: # %bb.0: @@ -1212,7 +1212,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1 ; X64-NEXT: vmovdqa %xmm1, %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i64 *%a0 to i8* %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> , i8 2) ret <2 x i64> %call @@ -1224,12 +1224,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; 
X64-LABEL: test_mm_mask_i64gather_epi64: ; X64: # %bb.0: ; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast i64 *%a1 to i8* %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2) ret <2 x i64> %call @@ -1243,7 +1243,7 @@ ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1 ; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_i64gather_epi64: ; X64: # %bb.0: @@ -1251,7 +1251,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1 ; X64-NEXT: vmovdqa %ymm1, %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i64 *%a0 to i8* %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> , i8 2) ret <4 x i64> %call @@ -1263,12 +1263,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_i64gather_epi64: ; X64: # %bb.0: ; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast i64 *%a1 to i8* %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2) ret <4 x i64> %call @@ -1282,7 +1282,7 @@ ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1 ; X86-NEXT: vmovapd %xmm1, %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_i64gather_pd: ; X64: # %bb.0: @@ -1290,7 +1290,7 @@ ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1 ; X64-NEXT: vmovapd %xmm1, %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast double *%a0 to i8* %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer %sext = sext <2 x i1> %cmp to <2 x i64> @@ -1305,12 +1305,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_i64gather_pd: ; X64: # %bb.0: ; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast double *%a1 to i8* %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2) ret <2 x double> %call @@ -1324,7 +1324,7 @@ ; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 ; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1 ; X86-NEXT: vmovapd %ymm1, %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_i64gather_pd: ; X64: # %bb.0: @@ -1332,7 +1332,7 @@ ; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 ; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1 ; X64-NEXT: vmovapd %ymm1, %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast double *%a0 to i8* %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0) %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2) @@ -1345,12 +1345,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_i64gather_pd: ; X64: # %bb.0: ; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq 
%arg1 = bitcast i64 *%a1 to i8* %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2) ret <4 x double> %call @@ -1364,7 +1364,7 @@ ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1 ; X86-NEXT: vmovaps %xmm1, %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_i64gather_ps: ; X64: # %bb.0: @@ -1372,7 +1372,7 @@ ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1 ; X64-NEXT: vmovaps %xmm1, %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast float *%a0 to i8* %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer %sext = sext <4 x i1> %cmp to <4 x i32> @@ -1387,12 +1387,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_i64gather_ps: ; X64: # %bb.0: ; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast float *%a1 to i8* %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2) ret <4 x float> %call @@ -1407,7 +1407,7 @@ ; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1 ; X86-NEXT: vmovaps %xmm1, %xmm0 ; X86-NEXT: vzeroupper -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_i64gather_ps: ; X64: # %bb.0: @@ -1416,7 +1416,7 @@ ; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1 ; X64-NEXT: vmovaps %xmm1, %xmm0 ; X64-NEXT: vzeroupper -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast float *%a0 to i8* %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer %sext = sext <4 x i1> %cmp to <4 x i32> @@ -1432,13 +1432,13 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ; X86-NEXT: vzeroupper -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_i64gather_ps: ; X64: # %bb.0: ; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0 ; X64-NEXT: vzeroupper -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg1 = bitcast float *%a1 to i8* %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2) ret <4 x float> %call @@ -1496,12 +1496,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_maskload_epi32: ; X64: # %bb.0: ; X64-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i32* %a0 to i8* %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1) @@ -1515,12 +1515,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskload_epi32: ; X64: # %bb.0: ; X64-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i32* %a0 to i8* %arg1 = bitcast <4 x i64> %a1 to <8 x i32> %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1) @@ -1534,12 +1534,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_maskload_epi64: ; X64: # %bb.0: ; X64-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm0 -; X64-NEXT: 
ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i64* %a0 to i8* %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1) ret <2 x i64> %res @@ -1551,12 +1551,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskload_epi64: ; X64: # %bb.0: ; X64-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i64* %a0 to i8* %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1) ret <4 x i64> %res @@ -1568,12 +1568,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax) -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_maskstore_epi32: ; X64: # %bb.0: ; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast float* %a0 to i8* %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %arg2 = bitcast <2 x i64> %a2 to <4 x i32> @@ -1588,13 +1588,13 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ; X86-NEXT: vzeroupper -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskstore_epi32: ; X64: # %bb.0: ; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast float* %a0 to i8* %arg1 = bitcast <4 x i64> %a1 to <8 x i32> %arg2 = bitcast <4 x i64> %a2 to <8 x i32> @@ -1608,12 +1608,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax) -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm_maskstore_epi64: ; X64: # %bb.0: ; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i64* %a0 to i8* call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2) ret void @@ -1626,13 +1626,13 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ; X86-NEXT: vzeroupper -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskstore_epi64: ; X64: # %bb.0: ; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) ; X64-NEXT: vzeroupper -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast i64* %a0 to i8* call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2) ret void @@ -2465,12 +2465,12 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovntdqa (%eax), %ymm0 -; X86-NEXT: ret{{[l|q]}} +; X86-NEXT: retl ; ; X64-LABEL: test_mm256_stream_load_si256: ; X64: # %bb.0: ; X64-NEXT: vmovntdqa (%rdi), %ymm0 -; X64-NEXT: ret{{[l|q]}} +; X64-NEXT: retq %arg0 = bitcast <4 x i64> *%a0 to i8* %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0) ret <4 x i64> %res Index: test/CodeGen/X86/packss.ll =================================================================== --- test/CodeGen/X86/packss.ll +++ test/CodeGen/X86/packss.ll @@ -74,28 +74,28 @@ ; X86-SSE-NEXT: psrad $31, %xmm0 ; X86-SSE-NEXT: pcmpgtd {{\.LCPI.*}}, %xmm1 ; X86-SSE-NEXT: packssdw %xmm1, %xmm0 -; X86-SSE-NEXT: ret{{[l|q]}} +; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: trunc_ashr_v4i32_icmp_v4i32: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: vpsrad $31, %xmm0, %xmm0 ; X86-AVX-NEXT: vpcmpgtd {{\.LCPI.*}}, %xmm1, %xmm1 ; X86-AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: ret{{[l|q]}} +; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: trunc_ashr_v4i32_icmp_v4i32: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: psrad $31, %xmm0 ; X64-SSE-NEXT: pcmpgtd 
{{.*}}(%rip), %xmm1 ; X64-SSE-NEXT: packssdw %xmm1, %xmm0 -; X64-SSE-NEXT: ret{{[l|q]}} +; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: trunc_ashr_v4i32_icmp_v4i32: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vpsrad $31, %xmm0, %xmm0 ; X64-AVX-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm1 ; X64-AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: ret{{[l|q]}} +; X64-AVX-NEXT: retq %1 = ashr <4 x i32> %a, %2 = icmp sgt <4 x i32> %b, %3 = sext <4 x i1> %2 to <4 x i32> Index: test/CodeGen/X86/sse42-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse42-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse42-intrinsics-fast-isel.ll @@ -195,34 +195,22 @@ declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone define <2 x i64> @test_mm_cmpgt_epi64(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_cmpgt_epi64: -; X32: # %bb.0: -; X32-NEXT: pcmpgtq %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmpgt_epi64: -; X64: # %bb.0: -; X64-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmpgt_epi64: +; ALL: # %bb.0: +; ALL-NEXT: pcmpgtq %xmm1, %xmm0 +; ALL-NEXT: ret{{[l|q]}} %cmp = icmp sgt <2 x i64> %a0, %a1 %res = sext <2 x i1> %cmp to <2 x i64> ret <2 x i64> %res } define i32 @test_mm_cmpistra(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_cmpistra: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X32-NEXT: seta %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmpistra: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X64-NEXT: seta %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmpistra: +; ALL: # %bb.0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: pcmpistri $7, %xmm1, %xmm0 +; ALL-NEXT: seta %al +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7) @@ -231,19 +219,12 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind readnone define i32 @test_mm_cmpistrc(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_cmpistrc: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X32-NEXT: setb %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmpistrc: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X64-NEXT: setb %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmpistrc: +; ALL: # %bb.0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: pcmpistri $7, %xmm1, %xmm0 +; ALL-NEXT: setb %al +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7) @@ -252,17 +233,11 @@ declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind readnone define i32 @test_mm_cmpistri(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_cmpistri: -; X32: # %bb.0: -; X32-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmpistri: -; X64: # %bb.0: -; X64-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmpistri: +; ALL: # %bb.0: +; ALL-NEXT: pcmpistri $7, %xmm1, %xmm0 +; ALL-NEXT: movl %ecx, %eax +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x 
i8> %arg0, <16 x i8> %arg1, i8 7) @@ -271,15 +246,10 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone define <2 x i64> @test_mm_cmpistrm(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_cmpistrm: -; X32: # %bb.0: -; X32-NEXT: pcmpistrm $7, %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmpistrm: -; X64: # %bb.0: -; X64-NEXT: pcmpistrm $7, %xmm1, %xmm0 -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmpistrm: +; ALL: # %bb.0: +; ALL-NEXT: pcmpistrm $7, %xmm1, %xmm0 +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7) @@ -289,19 +259,12 @@ declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone define i32 @test_mm_cmpistro(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_cmpistro: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X32-NEXT: seto %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmpistro: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X64-NEXT: seto %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmpistro: +; ALL: # %bb.0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: pcmpistri $7, %xmm1, %xmm0 +; ALL-NEXT: seto %al +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7) @@ -310,19 +273,12 @@ declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind readnone define i32 @test_mm_cmpistrs(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_cmpistrs: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X32-NEXT: sets %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmpistrs: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X64-NEXT: sets %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmpistrs: +; ALL: # %bb.0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: pcmpistri $7, %xmm1, %xmm0 +; ALL-NEXT: sets %al +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7) @@ -331,19 +287,12 @@ declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind readnone define i32 @test_mm_cmpistrz(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_cmpistrz: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X32-NEXT: sete %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cmpistrz: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: pcmpistri $7, %xmm1, %xmm0 -; X64-NEXT: sete %al -; X64-NEXT: retq +; ALL-LABEL: test_mm_cmpistrz: +; ALL: # %bb.0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: pcmpistri $7, %xmm1, %xmm0 +; ALL-NEXT: sete %al +; ALL-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7) Index: utils/UpdateTestChecks/asm.py =================================================================== --- utils/UpdateTestChecks/asm.py +++ utils/UpdateTestChecks/asm.py @@ -107,7 +107,7 @@ asm = SCRUB_X86_RIP_RE.sub(r'{{.*}}(%rip)', asm) # Generically match a LCP symbol. 
   asm = SCRUB_X86_LCP_RE.sub(r'{{\.LCPI.*}}', asm)
-  if getattr(args, 'x86_extra_scrub', False):
+  if getattr(args, 'extra_scrub', False):
     # Avoid generating different checks for 32- and 64-bit because of 'retl' vs 'retq'.
     asm = SCRUB_X86_RET_RE.sub(r'ret{{[l|q]}}', asm)
   # Strip kill operands inserted into the asm.
Index: utils/UpdateTestChecks/common.py
===================================================================
--- utils/UpdateTestChecks/common.py
+++ utils/UpdateTestChecks/common.py
@@ -3,6 +3,7 @@
 import string
 import subprocess
 import sys
+import copy
 
 if sys.version_info[0] > 2:
   class string:
@@ -80,13 +81,29 @@
   body = SCRUB_TRAILING_WHITESPACE_RE.sub(r'', body)
   return body
 
+def do_scrub(body, scrubber, scrubber_args, extra):
+  if scrubber_args:
+    local_args = copy.deepcopy(scrubber_args)
+    local_args[0].extra_scrub = extra
+    return scrubber(body, *local_args)
+  return scrubber(body, *scrubber_args)
+
 # Build up a dictionary of all the function bodies.
+class function_body(object):
+  def __init__(self, string, extrainfo):
+    self.string = string
+    self.extrainfo = extrainfo
+  def __str__(self):
+    return self.string
+
 def build_function_body_dictionary(function_re, scrubber, scrubber_args, raw_tool_output, prefixes, func_dict, verbose):
   for m in function_re.finditer(raw_tool_output):
     if not m:
       continue
     func = m.group('func')
-    scrubbed_body = scrubber(m.group('body'), *scrubber_args)
+    body = m.group('body')
+    scrubbed_body = do_scrub(body, scrubber, scrubber_args, False)
+    scrubbed_extra = do_scrub(body, scrubber, scrubber_args, True)
     if m.groupdict().has_key('analysis'):
       analysis = m.group('analysis')
       if analysis.lower() != 'cost model analysis':
@@ -99,15 +116,19 @@
       for l in scrubbed_body.splitlines():
         print('  ' + l, file=sys.stderr)
     for prefix in prefixes:
-      if func in func_dict[prefix] and func_dict[prefix][func] != scrubbed_body:
-        if prefix == prefixes[-1]:
-          print('WARNING: Found conflicting asm under the '
-                'same prefix: %r!' % (prefix,), file=sys.stderr)
-        else:
-          func_dict[prefix][func] = None
+      if func in func_dict[prefix] and str(func_dict[prefix][func]) != scrubbed_body:
+        if func_dict[prefix][func] and func_dict[prefix][func].extrainfo == scrubbed_extra:
+          func_dict[prefix][func].string = scrubbed_extra
           continue
+        else:
+          if prefix == prefixes[-1]:
+            print('WARNING: Found conflicting asm under the '
+                  'same prefix: %r!' % (prefix,), file=sys.stderr)
+          else:
+            func_dict[prefix][func] = None
+          continue
 
-      func_dict[prefix][func] = scrubbed_body
+      func_dict[prefix][func] = function_body(scrubbed_body, scrubbed_extra)
 
 
 ##### Generator of LLVM IR CHECK lines
@@ -188,7 +209,7 @@
 
       printed_prefixes.append(checkprefix)
       output_lines.append(check_label_format % (checkprefix, func_name))
 
-      func_body = func_dict[checkprefix][func_name].splitlines()
+      func_body = str(func_dict[checkprefix][func_name]).splitlines()
 
       # For ASM output, just emit the check lines.
       if is_asm == True:
Index: utils/update_llc_test_checks.py
===================================================================
--- utils/update_llc_test_checks.py
+++ utils/update_llc_test_checks.py
@@ -28,8 +28,8 @@
   parser.add_argument(
       '--function', help='The function in the test file to update')
   parser.add_argument(
-      '--x86_extra_scrub', action='store_true',
-      help='Use more regex for x86 matching to reduce diffs between various subtargets')
+      '--extra_scrub', action='store_true',
+      help='Use additional regex to further reduce diffs between various subtargets')
   parser.add_argument('tests', nargs='+')
   args = parser.parse_args()
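
Note on the common.py change above: build_function_body_dictionary now scrubs each function body twice (once normally, once with extra_scrub forced on) and stores both forms in a function_body object. When two RUN lines that share a check prefix produce bodies that differ only in ways the extra scrub hides (retl vs retq being the motivating case, as in the merged ALL prefix in the SSE4.2 test above), the stored body falls back to the extra-scrubbed form instead of the prefix being discarded. The following is a minimal, self-contained sketch of that fallback, not the actual UpdateTestChecks code: the RET_RE pattern, scrub(), record(), and the tuple-keyed func_dict are illustrative stand-ins, while function_body mirrors the class added in the patch.

import re

# Illustrative pattern only; asm.py keeps the real one in SCRUB_X86_RET_RE.
RET_RE = re.compile(r'\bret[lq]\b')

def scrub(body, extra):
  # The plain scrub leaves retl/retq alone; the extra scrub rewrites both to the
  # FileCheck regex form so 32- and 64-bit output can share one prefix.
  return RET_RE.sub('ret{{[l|q]}}', body) if extra else body

class function_body(object):
  def __init__(self, string, extrainfo):
    self.string = string        # text emitted as CHECK lines
    self.extrainfo = extrainfo  # extra-scrubbed variant used to reconcile conflicts
  def __str__(self):
    return self.string

func_dict = {}

def record(prefix, func, raw_body):
  # Simplified version of the conflict handling in build_function_body_dictionary.
  scrubbed = scrub(raw_body, False)
  scrubbed_extra = scrub(raw_body, True)
  key = (prefix, func)
  if key in func_dict and str(func_dict[key]) != scrubbed:
    if func_dict[key] and func_dict[key].extrainfo == scrubbed_extra:
      func_dict[key].string = scrubbed_extra  # only retl vs retq differ: keep the regex form
    else:
      func_dict[key] = None                   # genuine conflict: drop this prefix
    return
  func_dict[key] = function_body(scrubbed, scrubbed_extra)

# Two RUN lines that share the ALL prefix and differ only in the return instruction.
record('ALL', 'test_mm_cmpgt_epi64', 'pcmpgtq %xmm1, %xmm0\nretl')
record('ALL', 'test_mm_cmpgt_epi64', 'pcmpgtq %xmm1, %xmm0\nretq')
print(func_dict[('ALL', 'test_mm_cmpgt_epi64')])  # body now ends in ret{{[l|q]}}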
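
The flag rename in update_llc_test_checks.py drops the x86_ prefix, and the new help text no longer mentions x86, so the extra scrubbing reads as a target-agnostic option. Regenerating one of the tests above would look roughly like: utils/update_llc_test_checks.py --extra_scrub test/CodeGen/X86/sse42-intrinsics-fast-isel.ll (exact invocation assumed; each test's RUN lines supply the llc command). Even without the flag, the common.py fallback only switches to ret{{[l|q]}} when check prefixes are genuinely shared, which is why most of the checks in this patch collapse back to plain retl and retq while the SSE4.2 test's merged ALL prefix keeps the regex form.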