Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -325,6 +325,56 @@
   return Builder.CreateAShr(Vec, ShiftVec);
 }
 
+/// Attempt to constant fold an x86 MOVMSK-style intrinsic call.
+///
+/// The result packs the sign bit of vector element I of operand 0 into bit I
+/// of an integer of the call's result type; all remaining bits are zero.
+///
+/// \returns a zero constant when the operand is undef, the packed sign-bit
+/// constant when the operand is a vector Constant whose elements are all
+/// ConstantInt, ConstantFP or undef (undef elements contribute a zero bit),
+/// and nullptr when no fold is performed (non-vector operand such as x86_mmx,
+/// non-constant operand, or an element of any other kind).
+/// \p Builder is not used by this fold.
+static Value *simplifyX86movmsk(const IntrinsicInst &II,
+                                InstCombiner::BuilderTy &Builder) {
+  Value *Arg = II.getArgOperand(0);
+  Type *ResTy = II.getType();
+  Type *ArgTy = Arg->getType();
+
+  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
+  if (isa<UndefValue>(Arg))
+    return Constant::getNullValue(ResTy);
+
+  // We can't easily peek through x86_mmx types.
+  if (!ArgTy->isVectorTy())
+    return nullptr;
+
+  auto *C = dyn_cast<Constant>(Arg);
+  if (!C)
+    return nullptr;
+
+  // Extract signbits of the vector input and pack into integer result.
+  APInt Result(ResTy->getPrimitiveSizeInBits(), 0);
+  for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) {
+    auto *COp = C->getAggregateElement(I);
+    if (!COp)
+      return nullptr;
+    // An undef element leaves its result bit cleared.
+    if (isa<UndefValue>(COp))
+      continue;
+
+    // Only integer and floating point constant elements can be inspected.
+    auto *CInt = dyn_cast<ConstantInt>(COp);
+    auto *CFp = dyn_cast<ConstantFP>(COp);
+    if (!CInt && !CFp)
+      return nullptr;
+
+    if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative()))
+      Result.setBit(I);
+  }
+
+  return Constant::getIntegerValue(ResTy, Result);
+}
+
 static Value *simplifyX86insertps(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
@@ -1460,6 +1510,18 @@
     break;
   }
 
+  case Intrinsic::x86_mmx_pmovmskb:
+  case Intrinsic::x86_sse_movmsk_ps:
+  case Intrinsic::x86_sse2_movmsk_pd:
+  case Intrinsic::x86_sse2_pmovmskb_128:
+  case Intrinsic::x86_avx_movmsk_pd_256:
+  case Intrinsic::x86_avx_movmsk_ps_256:
+  case Intrinsic::x86_avx2_pmovmskb: {
+    if (Value *V = simplifyX86movmsk(*II, *Builder))
+      return replaceInstUsesWith(*II, V);
+    break;
+  }
+
   case Intrinsic::x86_sse_comieq_ss:
   case Intrinsic::x86_sse_comige_ss:
   case
Intrinsic::x86_sse_comigt_ss: Index: llvm/trunk/test/Transforms/InstCombine/x86-movmsk.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/x86-movmsk.ll +++ llvm/trunk/test/Transforms/InstCombine/x86-movmsk.ll @@ -129,6 +129,190 @@ ; llvm.x86.avx2.pmovmskb uses the whole of the 32-bit register. +; +; Constant Folding (UNDEF -> ZERO) +; + +define i32 @undef_x86_mmx_pmovmskb() { +; CHECK-LABEL: @undef_x86_mmx_pmovmskb( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx undef) + ret i32 %1 +} + +define i32 @undef_x86_sse_movmsk_ps() { +; CHECK-LABEL: @undef_x86_sse_movmsk_ps( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> undef) + ret i32 %1 +} + +define i32 @undef_x86_sse2_movmsk_pd() { +; CHECK-LABEL: @undef_x86_sse2_movmsk_pd( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> undef) + ret i32 %1 +} + +define i32 @undef_x86_sse2_pmovmskb_128() { +; CHECK-LABEL: @undef_x86_sse2_pmovmskb_128( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> undef) + ret i32 %1 +} + +define i32 @undef_x86_avx_movmsk_ps_256() { +; CHECK-LABEL: @undef_x86_avx_movmsk_ps_256( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> undef) + ret i32 %1 +} + +define i32 @undef_x86_avx_movmsk_pd_256() { +; CHECK-LABEL: @undef_x86_avx_movmsk_pd_256( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> undef) + ret i32 %1 +} + +define i32 @undef_x86_avx2_pmovmskb() { +; CHECK-LABEL: @undef_x86_avx2_pmovmskb( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> undef) + ret i32 %1 +} + +; +; Constant Folding (ZERO -> ZERO) +; + +define i32 @zero_x86_mmx_pmovmskb() { +; CHECK-LABEL: @zero_x86_mmx_pmovmskb( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<1 x i64> zeroinitializer to x86_mmx)) +; 
CHECK-NEXT: ret i32 [[TMP1]] +; + %1 = bitcast <1 x i64> zeroinitializer to x86_mmx + %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1) + ret i32 %2 +} + +define i32 @zero_x86_sse_movmsk_ps() { +; CHECK-LABEL: @zero_x86_sse_movmsk_ps( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> zeroinitializer) + ret i32 %1 +} + +define i32 @zero_x86_sse2_movmsk_pd() { +; CHECK-LABEL: @zero_x86_sse2_movmsk_pd( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> zeroinitializer) + ret i32 %1 +} + +define i32 @zero_x86_sse2_pmovmskb_128() { +; CHECK-LABEL: @zero_x86_sse2_pmovmskb_128( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> zeroinitializer) + ret i32 %1 +} + +define i32 @zero_x86_avx_movmsk_ps_256() { +; CHECK-LABEL: @zero_x86_avx_movmsk_ps_256( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> zeroinitializer) + ret i32 %1 +} + +define i32 @zero_x86_avx_movmsk_pd_256() { +; CHECK-LABEL: @zero_x86_avx_movmsk_pd_256( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> zeroinitializer) + ret i32 %1 +} + +define i32 @zero_x86_avx2_pmovmskb() { +; CHECK-LABEL: @zero_x86_avx2_pmovmskb( +; CHECK-NEXT: ret i32 0 +; + %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> zeroinitializer) + ret i32 %1 +} + +; +; Constant Folding +; + +define i32 @fold_x86_mmx_pmovmskb() { +; CHECK-LABEL: @fold_x86_mmx_pmovmskb( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<8 x i8> to x86_mmx)) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %1 = bitcast <8 x i8> to x86_mmx + %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1) + ret i32 %2 +} + +define i32 @fold_x86_sse_movmsk_ps() { +; CHECK-LABEL: @fold_x86_sse_movmsk_ps( +; CHECK-NEXT: ret i32 10 +; + %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> ) + ret i32 %1 +} + +define i32 @fold_x86_sse2_movmsk_pd() { +; CHECK-LABEL: @fold_x86_sse2_movmsk_pd( +; CHECK-NEXT: ret 
i32 2 +; + %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> ) + ret i32 %1 +} + +define i32 @fold_x86_sse2_pmovmskb_128() { +; CHECK-LABEL: @fold_x86_sse2_pmovmskb_128( +; CHECK-NEXT: ret i32 5654 +; + %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> ) + ret i32 %1 +} + +define i32 @fold_x86_avx_movmsk_ps_256() { +; CHECK-LABEL: @fold_x86_avx_movmsk_ps_256( +; CHECK-NEXT: ret i32 170 +; + %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> ) + ret i32 %1 +} + +define i32 @fold_x86_avx_movmsk_pd_256() { +; CHECK-LABEL: @fold_x86_avx_movmsk_pd_256( +; CHECK-NEXT: ret i32 10 +; + %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> ) + ret i32 %1 +} + +define i32 @fold_x86_avx2_pmovmskb() { +; CHECK-LABEL: @fold_x86_avx2_pmovmskb( +; CHECK-NEXT: ret i32 370546176 +; + %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> ) + ret i32 %1 +} + declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)