diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1151,14 +1151,20 @@
   case Intrinsic::cttz:
     // FIXME: If necessary, this should go in target-specific overrides.
-    if (VF == 1 && RetVF == 1 && getTLI()->isCheapToSpeculateCttz())
-      return TargetTransformInfo::TCC_Basic;
+    if (VF == 1 && RetVF == 1) {
+      if (getTLI()->isCheapToSpeculateCttz())
+        return TargetTransformInfo::TCC_Basic;
+      return TargetTransformInfo::TCC_Expensive;
+    }
     break;
   case Intrinsic::ctlz:
     // FIXME: If necessary, this should go in target-specific overrides.
-    if (VF == 1 && RetVF == 1 && getTLI()->isCheapToSpeculateCtlz())
-      return TargetTransformInfo::TCC_Basic;
+    if (VF == 1 && RetVF == 1) {
+      if (getTLI()->isCheapToSpeculateCtlz())
+        return TargetTransformInfo::TCC_Basic;
+      return TargetTransformInfo::TCC_Expensive;
+    }
     break;
   case Intrinsic::memcpy:
diff --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll
--- a/llvm/test/Analysis/CostModel/X86/cttz.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz.ll
@@ -16,7 +16,7 @@
 define i64 @var_cttz_i64(i64 %a) {
 ; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i64'
@@ -29,7 +29,7 @@
 define i64 @var_cttz_i64u(i64 %a) {
 ; NOBMI-LABEL: 'var_cttz_i64u'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 true)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 true)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i64u'
@@ -42,7 +42,7 @@
 define i32 @var_cttz_i32(i32 %a) {
 ; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i32'
@@ -55,7 +55,7 @@
 define i32 @var_cttz_i32u(i32 %a) {
 ; NOBMI-LABEL: 'var_cttz_i32u'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 true)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i32u'
@@ -68,7 +68,7 @@
 define i16 @var_cttz_i16(i16 %a) {
 ; NOBMI-LABEL: 'var_cttz_i16'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i16'
@@ -81,7 +81,7 @@
 define i16 @var_cttz_i16u(i16 %a) {
 ; NOBMI-LABEL: 'var_cttz_i16u'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i16u'
@@ -94,7 +94,7 @@
 define i8 @var_cttz_i8(i8 %a) {
 ; NOBMI-LABEL: 'var_cttz_i8'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i8'
@@ -107,7 +107,7 @@
 define i8 @var_cttz_i8u(i8 %a) {
 ; NOBMI-LABEL: 'var_cttz_i8u'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true)
 ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i8u'
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -85,7 +85,7 @@
 define void @cttz(i32 %a, <16 x i32> %va) {
 ; THRU-LABEL: 'cttz'
-; THRU-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; THRU-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; THRU-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -95,12 +95,12 @@
 ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'cttz'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'cttz'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll
--- a/llvm/test/CodeGen/X86/dagcombine-select.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-select.ll
@@ -435,9 +435,13 @@
 define i32 @cttz_32_eq_select_ffs_m1(i32 %v) nounwind {
 ; NOBMI-LABEL: cttz_32_eq_select_ffs_m1:
 ; NOBMI: # %bb.0:
-; NOBMI-NEXT: bsfl %edi, %ecx
+; NOBMI-NEXT: testl %edi, %edi
+; NOBMI-NEXT: je .LBB26_2
+; NOBMI-NEXT: # %bb.1: # %select.false.sink
+; NOBMI-NEXT: bsfl %edi, %eax
+; NOBMI-NEXT: retq
+; NOBMI-NEXT: .LBB26_2: # %select.end
 ; NOBMI-NEXT: movl $-1, %eax
-; NOBMI-NEXT: cmovnel %ecx, %eax
 ; NOBMI-NEXT: retq
 ;
 ; BMI-LABEL: cttz_32_eq_select_ffs_m1:
@@ -456,9 +460,13 @@
 define i32 @cttz_32_ne_select_ffs_m1(i32 %v) nounwind {
 ; NOBMI-LABEL: cttz_32_ne_select_ffs_m1:
 ; NOBMI: # %bb.0:
-; NOBMI-NEXT: bsfl %edi, %ecx
+; NOBMI-NEXT: testl %edi, %edi
+; NOBMI-NEXT: je .LBB27_2
+; NOBMI-NEXT: # %bb.1: # %select.true.sink
+; NOBMI-NEXT: bsfl %edi, %eax
+; NOBMI-NEXT: retq
+; NOBMI-NEXT: .LBB27_2: # %select.end
 ; NOBMI-NEXT: movl $-1, %eax
-; NOBMI-NEXT: cmovnel %ecx, %eax
 ; NOBMI-NEXT: retq
 ;
 ; BMI-LABEL: cttz_32_ne_select_ffs_m1:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
@@ -75,26 +75,11 @@
 }
 
 define void @cttz_4i32() #0 {
-; SSE2-LABEL: @cttz_4i32(
-; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
-; SSE2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
-; SSE2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
-; SSE2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
-; SSE2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT: ret void
-;
-; SSE42-LABEL: @cttz_4i32(
-; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE42-NEXT: ret void
+; SSE-LABEL: @cttz_4i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT: ret void
 ;
 ; AVX1-LABEL: @cttz_4i32(
 ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
@@ -133,41 +118,14 @@
 }
 
 define void @cttz_8i32() #0 {
-; SSE2-LABEL: @cttz_8i32(
-; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; SSE2-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; SSE2-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; SSE2-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; SSE2-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; SSE2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
-; SSE2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
-; SSE2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
-; SSE2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
-; SSE2-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 false)
-; SSE2-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false)
-; SSE2-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false)
-; SSE2-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false)
-; SSE2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; SSE2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; SSE2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; SSE2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; SSE2-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; SSE2-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; SSE2-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; SSE2-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
-; SSE2-NEXT: ret void
-;
-; SSE42-LABEL: @cttz_8i32(
-; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 false)
-; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT: ret void
+; SSE-LABEL: @cttz_8i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 false)
+; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @cttz_8i32(
 ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
@@ -531,26 +489,11 @@
 }
 
 define void @cttz_undef_4i32() #0 {
-; SSE2-LABEL: @cttz_undef_4i32(
-; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true)
-; SSE2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true)
-; SSE2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true)
-; SSE2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true)
-; SSE2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT: ret void
-;
-; SSE42-LABEL: @cttz_undef_4i32(
-; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE42-NEXT: ret void
+; SSE-LABEL: @cttz_undef_4i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT: ret void
 ;
 ; AVX1-LABEL: @cttz_undef_4i32(
 ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
@@ -589,41 +532,14 @@
 }
 
 define void @cttz_undef_8i32() #0 {
-; SSE2-LABEL: @cttz_undef_8i32(
-; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; SSE2-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; SSE2-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; SSE2-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; SSE2-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; SSE2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true)
-; SSE2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true)
-; SSE2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true)
-; SSE2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true)
-; SSE2-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 true)
-; SSE2-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 true)
-; SSE2-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 true)
-; SSE2-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 true)
-; SSE2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; SSE2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; SSE2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; SSE2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; SSE2-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; SSE2-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; SSE2-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; SSE2-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
-; SSE2-NEXT: ret void
-;
-; SSE42-LABEL: @cttz_undef_8i32(
-; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 true)
-; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT: ret void
+; SSE-LABEL: @cttz_undef_8i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 true)
+; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @cttz_undef_8i32(
 ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll b/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll
--- a/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll
@@ -407,22 +407,64 @@
 }
 
 define i16 @test9_loop(i32 %x, i16* %ptr) {
-; ALL-LABEL: @test9_loop(
-; ALL-NEXT: entry:
-; ALL-NEXT: br label [[LOOP_HEADER:%.*]]
-; ALL: loop.header:
-; ALL-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_HEADER]] ]
-; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0
-; ALL-NEXT: [[XOR:%.*]] = xor i32 [[X]], -1
-; ALL-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[XOR]], i1 true)
-; ALL-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16
-; ALL-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 32, i16 [[CAST]]
-; ALL-NEXT: store i16 [[COND]], i16* [[PTR:%.*]], align 2
-; ALL-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; ALL-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100
-; ALL-NEXT: br i1 [[EC]], label [[LOOP_EXIT:%.*]], label [[LOOP_HEADER]]
-; ALL: loop.exit:
-; ALL-NEXT: ret i16 [[COND]]
+; BMI-LABEL: @test9_loop(
+; BMI-NEXT: entry:
+; BMI-NEXT: br label [[LOOP_HEADER:%.*]]
+; BMI: loop.header:
+; BMI-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_HEADER]] ]
+; BMI-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0
+; BMI-NEXT: [[XOR:%.*]] = xor i32 [[X]], -1
+; BMI-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[XOR]], i1 true)
+; BMI-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16
+; BMI-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 32, i16 [[CAST]]
+; BMI-NEXT: store i16 [[COND]], i16* [[PTR:%.*]], align 2
+; BMI-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; BMI-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100
+; BMI-NEXT: br i1 [[EC]], label [[LOOP_EXIT:%.*]], label [[LOOP_HEADER]]
+; BMI: loop.exit:
+; BMI-NEXT: ret i16 [[COND]]
+;
+; LZCNT-LABEL: @test9_loop(
+; LZCNT-NEXT: entry:
+; LZCNT-NEXT: br label [[LOOP_HEADER:%.*]]
+; LZCNT: loop.header:
+; LZCNT-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[COND_END:%.*]] ]
+; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0
+; LZCNT-NEXT: br i1 [[TOBOOL]], label [[COND_END]], label [[COND_TRUE:%.*]]
+; LZCNT: cond.true:
+; LZCNT-NEXT: [[XOR:%.*]] = xor i32 [[X]], -1
+; LZCNT-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[XOR]], i1 true)
+; LZCNT-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16
+; LZCNT-NEXT: br label [[COND_END]]
+; LZCNT: cond.end:
+; LZCNT-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 32, [[LOOP_HEADER]] ]
+; LZCNT-NEXT: store i16 [[COND]], i16* [[PTR:%.*]], align 2
+; LZCNT-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; LZCNT-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100
+; LZCNT-NEXT: br i1 [[EC]], label [[LOOP_EXIT:%.*]], label [[LOOP_HEADER]]
+; LZCNT: loop.exit:
+; LZCNT-NEXT: ret i16 [[COND]]
+;
+; GENERIC-LABEL: @test9_loop(
+; GENERIC-NEXT: entry:
+; GENERIC-NEXT: br label [[LOOP_HEADER:%.*]]
+; GENERIC: loop.header:
+; GENERIC-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[COND_END:%.*]] ]
+; GENERIC-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0
+; GENERIC-NEXT: br i1 [[TOBOOL]], label [[COND_END]], label [[COND_TRUE:%.*]]
+; GENERIC: cond.true:
+; GENERIC-NEXT: [[XOR:%.*]] = xor i32 [[X]], -1
+; GENERIC-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[XOR]], i1 true)
+; GENERIC-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16
+; GENERIC-NEXT: br label [[COND_END]]
+; GENERIC: cond.end:
+; GENERIC-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 32, [[LOOP_HEADER]] ]
+; GENERIC-NEXT: store i16 [[COND]], i16* [[PTR:%.*]], align 2
+; GENERIC-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; GENERIC-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100
+; GENERIC-NEXT: br i1 [[EC]], label [[LOOP_EXIT:%.*]], label [[LOOP_HEADER]]
+; GENERIC: loop.exit:
+; GENERIC-NEXT: ret i16 [[COND]]
 ;
 entry:
   br label %loop.header
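
Illustrative sketch (not part of the patch): the functional effect is that on targets where TargetLowering reports cttz as not cheap to speculate (for example x86 without BMI), the scalar intrinsic is now costed as TCC_Expensive, so cost-model-driven transforms such as SimplifyCFG keep a zero-guarded branch around @llvm.cttz.i32 instead of speculating it into a select. The reduced IR below is hypothetical (the function name @ffs_like is made up), modeled on the cttz_32_eq_select_ffs_m1 and test9_loop tests above; with this change a non-BMI x86 target is expected to leave this branchy form in place rather than flatten it.

; Hypothetical reduction illustrating the speculation decision; not taken from the patch.
define i32 @ffs_like(i32 %x) {
entry:
  %is.zero = icmp eq i32 %x, 0
  br i1 %is.zero, label %end, label %nonzero

nonzero:                              ; %x is known non-zero here, so is_zero_poison may be true
  %tz = call i32 @llvm.cttz.i32(i32 %x, i1 true)
  br label %end

end:                                  ; ffs(x)-1 semantics: -1 when %x == 0, cttz(%x) otherwise
  %res = phi i32 [ -1, %entry ], [ %tz, %nonzero ]
  ret i32 %res
}

declare i32 @llvm.cttz.i32(i32, i1)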