Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -527,6 +527,13 @@
     if (Changed) return II;
   }
 
+  auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
+                                              unsigned DemandedWidth) {
+    APInt UndefElts(Width, 0);
+    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
+    return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
+  };
+
   switch (II->getIntrinsicID()) {
   default: break;
   case Intrinsic::objectsize: {
@@ -975,6 +982,54 @@
       return ReplaceInstUsesWith(*II, V);
     break;
 
+  case Intrinsic::x86_sse4a_extrq: {
+    // EXTRQ uses only the lowest 64 bits of the first 128-bit vector
+    // operand and the lowest 16 bits of the second.
+    Value *Op0 = II->getArgOperand(0);
+    Value *Op1 = II->getArgOperand(1);
+    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
+    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
+    assert(VWidth0 == 2 && VWidth1 == 16 && "Unexpected operand sizes");
+
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
+      II->setArgOperand(1, V);
+      return II;
+    }
+    break;
+  }
+
+  case Intrinsic::x86_sse4a_extrqi: {
+    // EXTRQI uses only the lowest 64 bits of the first 128-bit vector
+    // operand.
+    Value *Op = II->getArgOperand(0);
+    unsigned VWidth = Op->getType()->getVectorNumElements();
+    assert(VWidth == 2 && "Unexpected operand size");
+
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op, VWidth, 1)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+    break;
+  }
+
+  case Intrinsic::x86_sse4a_insertq: {
+    // INSERTQ uses only the lowest 64 bits of the first 128-bit vector
+    // operand.
+    Value *Op = II->getArgOperand(0);
+    unsigned VWidth = Op->getType()->getVectorNumElements();
+    assert(VWidth == 2 && "Unexpected operand size");
+
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op, VWidth, 1)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+    break;
+  }
+
   case Intrinsic::x86_sse4a_insertqi: {
     // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
     // ones undef
@@ -1051,6 +1106,24 @@
         }
       }
     }
+
+    // INSERTQI uses only the lowest 64 bits of the first two 128-bit vector
+    // operands.
+    Value *Op0 = II->getArgOperand(0);
+    Value *Op1 = II->getArgOperand(1);
+    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
+    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
+    assert(VWidth0 == 2 && VWidth1 == 2 && "Unexpected operand sizes");
+
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
+      II->setArgOperand(1, V);
+      return II;
+    }
     break;
   }
Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -412,7 +412,7 @@
     Value *LHS, *RHS;
     if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN)
       return nullptr;
-    
+
     if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero,
                              RHSKnownOne, Depth + 1) ||
         SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero,
@@ -1237,6 +1237,15 @@
       // like undef&0.  The result is known zero, not undef.
       UndefElts &= UndefElts2;
       break;
+
+    // SSE4A instructions leave the upper 64 bits of the 128-bit result
+    // in an undefined state.
+    case Intrinsic::x86_sse4a_extrq:
+    case Intrinsic::x86_sse4a_extrqi:
+    case Intrinsic::x86_sse4a_insertq:
+    case Intrinsic::x86_sse4a_insertqi:
+      UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2);
+      break;
     }
     break;
   }
Index: test/Transforms/InstCombine/x86-sse4a.ll
===================================================================
--- test/Transforms/InstCombine/x86-sse4a.ll
+++ test/Transforms/InstCombine/x86-sse4a.ll
@@ -121,5 +121,124 @@
   ret <2 x i64> %1
 }
 
+;
+; Vector Demanded Bits
+;
+
+define <2 x i64> @test_extrq_arg0(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrq_arg0
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %1, <16 x i8> %y) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrq_arg1(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrq_arg1
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <16 x i8> %y, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %1) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrq_args01(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrq_args01
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = shufflevector <16 x i8> %y, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %3 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %1, <16 x i8> %2) nounwind
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_extrq_ret(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrq_ret
+; CHECK-NEXT: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrqi_arg0(<2 x i64> %x) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrqi_arg0
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %1, i8 3, i8 2)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrqi_ret(<2 x i64> %x) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrqi_ret
+; CHECK-NEXT: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertq_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertq_arg0
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %1, <2 x i64> %y) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertq_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertq_ret
+; CHECK-NEXT: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertqi_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertqi_arg0
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %y, i8 3, i8 2) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertqi_arg1(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertqi_arg1
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %y, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %1, i8 3, i8 2) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertqi_args01(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertqi_args01
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = shufflevector <2 x i64> %y, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %3 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %2, i8 3, i8 2) nounwind
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_insertqi_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertqi_ret
+; CHECK-NEXT: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %2
+}
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.extrq
+declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.extrqi
+declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertq
+declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>) nounwind
+
 ; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
 declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind