Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -768,6 +768,28 @@
       // TODO: Could compute known zero/one bits based on the input.
       break;
     }
+    case Intrinsic::x86_sse_movmsk_ps:
+    case Intrinsic::x86_sse2_movmsk_pd:
+    case Intrinsic::x86_sse2_pmovmskb_128:
+    case Intrinsic::x86_avx_movmsk_ps_256:
+    case Intrinsic::x86_avx_movmsk_pd_256:
+    case Intrinsic::x86_avx2_pmovmskb: {
+      // MOVMSK copies the vector elements' sign bits to the low bits
+      // and zeros the high bits.
+      auto Arg = II->getArgOperand(0);
+      auto ArgType = cast<VectorType>(Arg->getType());
+      unsigned ArgWidth = ArgType->getNumElements();
+
+      // If we don't need any of the low bits then return zero;
+      // we know that DemandedMask is non-zero already.
+      APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
+      if (DemandedElts == 0)
+        return ConstantInt::getNullValue(VTy);
+
+      // We know that the upper bits are set to zero.
+      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - ArgWidth);
+      return nullptr;
+    }
    case Intrinsic::x86_sse42_crc32_64_64:
      KnownZero = APInt::getHighBitsSet(64, 32);
      return nullptr;
Index: test/Transforms/InstCombine/x86-movmsk.ll
===================================================================
--- test/Transforms/InstCombine/x86-movmsk.ll
+++ test/Transforms/InstCombine/x86-movmsk.ll
@@ -5,14 +5,12 @@
 
 ;
 ; DemandedBits - MOVMSK zeros the upper bits of the result.
-; TODO - we can get the and for free
 ;
 
 define i32 @test_upper_x86_sse_movmsk_ps(<4 x float> %a0) {
 ; CHECK-LABEL: @test_upper_x86_sse_movmsk_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 15
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
   %2 = and i32 %1, 15
@@ -22,8 +20,7 @@
 define i32 @test_upper_x86_sse2_movmsk_pd(<2 x double> %a0) {
 ; CHECK-LABEL: @test_upper_x86_sse2_movmsk_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 3
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
   %2 = and i32 %1, 3
@@ -33,8 +30,7 @@
 define i32 @test_upper_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
 ; CHECK-LABEL: @test_upper_x86_sse2_pmovmskb_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 65535
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
   %2 = and i32 %1, 65535
@@ -44,8 +40,7 @@
 define i32 @test_upper_x86_avx_movmsk_ps_256(<8 x float> %a0) {
 ; CHECK-LABEL: @test_upper_x86_avx_movmsk_ps_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 255
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
   %2 = and i32 %1, 255
@@ -55,8 +50,7 @@
 define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
 ; CHECK-LABEL: @test_upper_x86_avx_movmsk_pd_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 15
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
   %2 = and i32 %1, 15
@@ -67,14 +61,11 @@
 
 ;
 ; DemandedBits - If we don't use the lower bits then we just return zero.
-; TODO - just return zero
 ;
 
 define i32 @test_lower_x86_sse_movmsk_ps(<4 x float> %a0) {
 ; CHECK-LABEL: @test_lower_x86_sse_movmsk_ps(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -16
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 0
 ;
   %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
   %2 = and i32 %1, -16
@@ -83,9 +74,7 @@
 
 define i32 @test_lower_x86_sse2_movmsk_pd(<2 x double> %a0) {
 ; CHECK-LABEL: @test_lower_x86_sse2_movmsk_pd(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -4
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 0
 ;
   %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
   %2 = and i32 %1, -4
@@ -94,9 +83,7 @@
 
 define i32 @test_lower_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
 ; CHECK-LABEL: @test_lower_x86_sse2_pmovmskb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -65536
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 0
 ;
   %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
   %2 = and i32 %1, -65536
@@ -105,9 +92,7 @@
 
 define i32 @test_lower_x86_avx_movmsk_ps_256(<8 x float> %a0) {
 ; CHECK-LABEL: @test_lower_x86_avx_movmsk_ps_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -256
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 0
 ;
   %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
   %2 = and i32 %1, -256
@@ -116,9 +101,7 @@
 
 define i32 @test_lower_x86_avx_movmsk_pd_256(<4 x double> %a0) {
 ; CHECK-LABEL: @test_lower_x86_avx_movmsk_pd_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -16
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 0
 ;
   %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
   %2 = and i32 %1, -16
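
For reference, a minimal standalone sketch of the two folds this patch enables
(the function name is hypothetical; assumes the patch is applied and the IR is
run through opt -instcombine):

  declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)

  define i32 @movmsk_demanded_example(<4 x float> %v) {
    ; MOVMSK on <4 x float> defines only bits 0..3; bits 4..31 are known zero.
    %msk = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %v)
    ; Masking with 15 demands only the low 4 bits, so the 'and' is redundant
    ; and should fold away, leaving 'ret i32 %msk'.
    %res = and i32 %msk, 15
    ret i32 %res
  }

Conversely, masking the same call with -16 demands none of the 4 live low
bits, so SimplifyDemandedBits can prove the whole expression is zero and the
body folds to 'ret i32 0', matching the updated CHECK lines above.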