diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1151,14 +1151,20 @@
   case Intrinsic::cttz:
     // FIXME: If necessary, this should go in target-specific overrides.
-    if (VF == 1 && RetVF == 1 && getTLI()->isCheapToSpeculateCttz())
-      return TargetTransformInfo::TCC_Basic;
+    if (VF == 1 && RetVF == 1) {
+      if (getTLI()->isCheapToSpeculateCttz())
+        return TargetTransformInfo::TCC_Basic;
+      return TargetTransformInfo::TCC_Expensive;
+    }
     break;
   case Intrinsic::ctlz:
     // FIXME: If necessary, this should go in target-specific overrides.
-    if (VF == 1 && RetVF == 1 && getTLI()->isCheapToSpeculateCtlz())
-      return TargetTransformInfo::TCC_Basic;
+    if (VF == 1 && RetVF == 1) {
+      if (getTLI()->isCheapToSpeculateCtlz())
+        return TargetTransformInfo::TCC_Basic;
+      return TargetTransformInfo::TCC_Expensive;
+    }
     break;
   case Intrinsic::memcpy:
diff --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll
--- a/llvm/test/Analysis/CostModel/X86/cttz.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz.ll
@@ -16,7 +16,7 @@
 define i64 @var_cttz_i64(i64 %a) {
 ; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i64'
@@ -29,7 +29,7 @@
 define i64 @var_cttz_i64u(i64 %a) {
 ; NOBMI-LABEL: 'var_cttz_i64u'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 true)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 true)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i64u'
@@ -42,7 +42,7 @@
 define i32 @var_cttz_i32(i32 %a) {
 ; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i32'
@@ -55,7 +55,7 @@
 define i32 @var_cttz_i32u(i32 %a) {
 ; NOBMI-LABEL: 'var_cttz_i32u'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 true)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i32u'
@@ -68,7 +68,7 @@
 define i16 @var_cttz_i16(i16 %a) {
 ; NOBMI-LABEL: 'var_cttz_i16'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i16'
@@ -81,7 +81,7 @@
 define i16 @var_cttz_i16u(i16 %a) {
 ; NOBMI-LABEL: 'var_cttz_i16u'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i16u'
@@ -94,7 +94,7 @@
 define i8 @var_cttz_i8(i8 %a) {
 ; NOBMI-LABEL: 'var_cttz_i8'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i8'
@@ -107,7 +107,7 @@
 define i8 @var_cttz_i8u(i8 %a) {
 ; NOBMI-LABEL: 'var_cttz_i8u'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i8u'
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -85,7 +85,7 @@
 define void @cttz(i32 %a, <16 x i32> %va) {
 ; THRU-LABEL: 'cttz'
-; THRU-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; THRU-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; THRU-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -95,12 +95,12 @@
 ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'cttz'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'cttz'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll
--- a/llvm/test/CodeGen/X86/dagcombine-select.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-select.ll
@@ -435,9 +435,13 @@
 define i32 @cttz_32_eq_select_ffs_m1(i32 %v) nounwind {
 ; NOBMI-LABEL: cttz_32_eq_select_ffs_m1:
 ; NOBMI: # %bb.0:
-; NOBMI-NEXT: bsfl %edi, %ecx
+; NOBMI-NEXT: testl %edi, %edi
+; NOBMI-NEXT: je .LBB26_2
+; NOBMI-NEXT: # %bb.1: # %select.false.sink
+; NOBMI-NEXT: bsfl %edi, %eax
+; NOBMI-NEXT: retq
+; NOBMI-NEXT: .LBB26_2: # %select.end
 ; NOBMI-NEXT: movl $-1, %eax
-; NOBMI-NEXT: cmovnel %ecx, %eax
 ; NOBMI-NEXT: retq
 ;
 ; BMI-LABEL: cttz_32_eq_select_ffs_m1:
@@ -456,9 +460,13 @@
 define i32 @cttz_32_ne_select_ffs_m1(i32 %v) nounwind {
 ; NOBMI-LABEL: cttz_32_ne_select_ffs_m1:
 ; NOBMI: # %bb.0:
-; NOBMI-NEXT: bsfl %edi, %ecx
+; NOBMI-NEXT: testl %edi, %edi
+; NOBMI-NEXT: je .LBB27_2
+; NOBMI-NEXT: # %bb.1: # %select.true.sink
+; NOBMI-NEXT: bsfl %edi, %eax
+; NOBMI-NEXT: retq
+; NOBMI-NEXT: .LBB27_2: # %select.end
 ; NOBMI-NEXT: movl $-1, %eax
-; NOBMI-NEXT: cmovnel %ecx, %eax
 ; NOBMI-NEXT: retq
 ;
 ; BMI-LABEL: cttz_32_ne_select_ffs_m1:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
@@ -75,26 +75,11 @@
 }
 
 define void @cttz_4i32() #0 {
-; SSE2-LABEL: @cttz_4i32(
-; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
-; SSE2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
-; SSE2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
-; SSE2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
-; SSE2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT: ret void
-;
-; SSE42-LABEL: @cttz_4i32(
-; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE42-NEXT: ret void
+; SSE-LABEL: @cttz_4i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT: ret void
 ;
 ; AVX1-LABEL: @cttz_4i32(
 ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
@@ -133,41 +118,14 @@
 }
 
 define void @cttz_8i32() #0 {
-; SSE2-LABEL: @cttz_8i32(
-; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; SSE2-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; SSE2-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; SSE2-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; SSE2-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; SSE2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
-; SSE2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
-; SSE2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
-; SSE2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
-; SSE2-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 false)
-; SSE2-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false)
-; SSE2-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false)
-; SSE2-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false)
-; SSE2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; SSE2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; SSE2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; SSE2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; SSE2-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; SSE2-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; SSE2-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; SSE2-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
-; SSE2-NEXT: ret void
-;
-; SSE42-LABEL: @cttz_8i32(
-; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 false)
-; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT: ret void
+; SSE-LABEL: @cttz_8i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 false)
+; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @cttz_8i32(
 ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
@@ -531,26 +489,11 @@
 }
 
 define void @cttz_undef_4i32() #0 {
-; SSE2-LABEL: @cttz_undef_4i32(
-; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true)
-; SSE2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true)
-; SSE2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true)
-; SSE2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true)
-; SSE2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT: ret void
-;
-; SSE42-LABEL: @cttz_undef_4i32(
-; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE42-NEXT: ret void
+; SSE-LABEL: @cttz_undef_4i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT: ret void
 ;
 ; AVX1-LABEL: @cttz_undef_4i32(
 ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
@@ -589,41 +532,14 @@
 }
 
 define void @cttz_undef_8i32() #0 {
-; SSE2-LABEL: @cttz_undef_8i32(
-; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; SSE2-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; SSE2-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; SSE2-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; SSE2-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; SSE2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true)
-; SSE2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true)
-; SSE2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true)
-; SSE2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true)
-; SSE2-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 true)
-; SSE2-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 true)
-; SSE2-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 true)
-; SSE2-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 true)
-; SSE2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; SSE2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; SSE2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; SSE2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; SSE2-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; SSE2-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; SSE2-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; SSE2-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
-; SSE2-NEXT: ret void
-;
-; SSE42-LABEL: @cttz_undef_8i32(
-; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 true)
-; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT: ret void
+; SSE-LABEL: @cttz_undef_8i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 true)
+; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @cttz_undef_8i32(
 ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll b/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll
--- a/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll
@@ -407,22 +407,64 @@
 }
 
 define i16 @test9_loop(i32 %x, i16* %ptr) {
-; ALL-LABEL: @test9_loop(
-; ALL-NEXT: entry:
-; ALL-NEXT: br label [[LOOP_HEADER:%.*]]
-; ALL: loop.header:
-; ALL-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_HEADER]] ]
-; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0
-; ALL-NEXT: [[XOR:%.*]] = xor i32 [[X]], -1
-; ALL-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[XOR]], i1 true)
-; ALL-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16
-; ALL-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 32, i16 [[CAST]]
-; ALL-NEXT: store i16 [[COND]], i16* [[PTR:%.*]], align 2
-; ALL-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; ALL-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100
-; ALL-NEXT: br i1 [[EC]], label [[LOOP_EXIT:%.*]], label [[LOOP_HEADER]]
-; ALL: loop.exit:
-; ALL-NEXT: ret i16 [[COND]]
+; BMI-LABEL: @test9_loop(
+; BMI-NEXT: entry:
+; BMI-NEXT: br label [[LOOP_HEADER:%.*]]
+; BMI: loop.header:
+; BMI-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_HEADER]] ]
+; BMI-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0
+; BMI-NEXT: [[XOR:%.*]] = xor i32 [[X]], -1
+; BMI-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[XOR]], i1 true)
+; BMI-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16
+; BMI-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 32, i16 [[CAST]]
+; BMI-NEXT: store i16 [[COND]], i16* [[PTR:%.*]], align 2
+; BMI-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; BMI-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100
+; BMI-NEXT: br i1 [[EC]], label [[LOOP_EXIT:%.*]], label [[LOOP_HEADER]]
+; BMI: loop.exit:
+; BMI-NEXT: ret i16 [[COND]]
+;
+; LZCNT-LABEL: @test9_loop(
+; LZCNT-NEXT: entry:
+; LZCNT-NEXT: br label [[LOOP_HEADER:%.*]]
+; LZCNT: loop.header:
+; LZCNT-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[COND_END:%.*]] ]
+; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0
+; LZCNT-NEXT: br i1 [[TOBOOL]], label [[COND_END]], label [[COND_TRUE:%.*]]
+; LZCNT: cond.true:
+; LZCNT-NEXT: [[XOR:%.*]] = xor i32 [[X]], -1
+; LZCNT-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[XOR]], i1 true)
+; LZCNT-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16
+; LZCNT-NEXT: br label [[COND_END]]
+; LZCNT: cond.end:
+; LZCNT-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 32, [[LOOP_HEADER]] ]
+; LZCNT-NEXT: store i16 [[COND]], i16* [[PTR:%.*]], align 2
+; LZCNT-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; LZCNT-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100
+; LZCNT-NEXT: br i1 [[EC]], label [[LOOP_EXIT:%.*]], label [[LOOP_HEADER]]
+; LZCNT: loop.exit:
+; LZCNT-NEXT: ret i16 [[COND]]
+;
+; GENERIC-LABEL: @test9_loop(
+; GENERIC-NEXT: entry:
+; GENERIC-NEXT: br label [[LOOP_HEADER:%.*]]
+; GENERIC: loop.header:
+; GENERIC-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[COND_END:%.*]] ]
+; GENERIC-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0
+; GENERIC-NEXT: br i1 [[TOBOOL]], label [[COND_END]], label [[COND_TRUE:%.*]]
+; GENERIC: cond.true:
+; GENERIC-NEXT: [[XOR:%.*]] = xor i32 [[X]], -1
+; GENERIC-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[XOR]], i1 true)
+; GENERIC-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16
+; GENERIC-NEXT: br label [[COND_END]]
+; GENERIC: cond.end:
+; GENERIC-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 32, [[LOOP_HEADER]] ]
+; GENERIC-NEXT: store i16 [[COND]], i16* [[PTR:%.*]], align 2
+; GENERIC-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; GENERIC-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100
+; GENERIC-NEXT: br i1 [[EC]], label [[LOOP_EXIT:%.*]], label [[LOOP_HEADER]]
+; GENERIC: loop.exit:
+; GENERIC-NEXT: ret i16 [[COND]]
 ;
 entry:
   br label %loop.header
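
Illustrative sketch (not part of the patch): the functional effect is that on targets where TargetLowering reports cttz as not cheap to speculate (for example x86 without BMI), the scalar intrinsic is now costed as TCC_Expensive, so cost-model-driven transforms such as SimplifyCFG keep a zero-guarded branch around @llvm.cttz.i32 instead of speculating it into a select. The reduced IR below is hypothetical (the function name @ffs_like is made up), modeled on the cttz_32_eq_select_ffs_m1 and test9_loop tests above; with this change a non-BMI x86 target is expected to leave this branchy form in place rather than flatten it.

; Hypothetical reduction illustrating the speculation decision; not taken from the patch.
define i32 @ffs_like(i32 %x) {
entry:
  %is.zero = icmp eq i32 %x, 0
  br i1 %is.zero, label %end, label %nonzero

nonzero:                              ; %x is known non-zero here, so is_zero_poison may be true
  %tz = call i32 @llvm.cttz.i32(i32 %x, i1 true)
  br label %end

end:                                  ; ffs(x)-1 semantics: -1 when %x == 0, cttz(%x) otherwise
  %res = phi i32 [ -1, %entry ], [ %tz, %nonzero ]
  ret i32 %res
}

declare i32 @llvm.cttz.i32(i32, i1)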