Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -928,6 +928,63 @@
       return ReplaceInstUsesWith(*II, V);
     break;

+  case Intrinsic::x86_sse4a_extrq: {
+    // EXTRQ uses only the lowest 64-bits of the first 128-bit vector
+    // operand and the lowest 16-bits of the second.
+    auto Op0 = II->getArgOperand(0);
+    auto Op1 = II->getArgOperand(1);
+    unsigned VWidth0 = cast<VectorType>(Op0->getType())->getNumElements();
+    unsigned VWidth1 = cast<VectorType>(Op1->getType())->getNumElements();
+    assert(VWidth0 == 2 && VWidth1 == 16 && "Unexpected operand sizes");
+
+    APInt DemandedElts0 = APInt::getLowBitsSet(VWidth0, 1);
+    APInt UndefElts0(VWidth0, 0);
+    if (Value *V = SimplifyDemandedVectorElts(Op0, DemandedElts0, UndefElts0)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+
+    APInt DemandedElts1 = APInt::getLowBitsSet(VWidth1, 2);
+    APInt UndefElts1(VWidth1, 0);
+    if (Value *V = SimplifyDemandedVectorElts(Op1, DemandedElts1, UndefElts1)) {
+      II->setArgOperand(1, V);
+      return II;
+    }
+    break;
+  }
+
+  case Intrinsic::x86_sse4a_extrqi: {
+    // EXTRQI uses only the lowest 64-bits of the first 128-bit vector
+    // operand.
+    auto Op = II->getArgOperand(0);
+    unsigned VWidth = cast<VectorType>(Op->getType())->getNumElements();
+    assert(VWidth == 2 && "Unexpected operand size");
+
+    APInt DemandedElts = APInt::getLowBitsSet(VWidth, 1);
+    APInt UndefElts(VWidth, 0);
+    if (Value *V = SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+    break;
+  }
+
+  case Intrinsic::x86_sse4a_insertq: {
+    // INSERTQ uses only the lowest 64-bits of the first 128-bit vector
+    // operand.
+    auto Op = II->getArgOperand(0);
+    unsigned VWidth = cast<VectorType>(Op->getType())->getNumElements();
+    assert(VWidth == 2 && "Unexpected operand size");
+
+    APInt DemandedElts = APInt::getLowBitsSet(VWidth, 1);
+    APInt UndefElts(VWidth, 0);
+    if (Value *V = SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+    break;
+  }
+
   case Intrinsic::x86_sse4a_insertqi: {
     // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
     // ones undef
@@ -1004,6 +1061,28 @@
         }
       }
     }
+
+    // INSERTQI uses only the lowest 64-bits of the first two 128-bit vector
+    // operands.
+    auto Op0 = II->getArgOperand(0);
+    auto Op1 = II->getArgOperand(1);
+    unsigned VWidth0 = cast<VectorType>(Op0->getType())->getNumElements();
+    unsigned VWidth1 = cast<VectorType>(Op1->getType())->getNumElements();
+    assert(VWidth0 == 2 && VWidth1 == 2 && "Unexpected operand sizes");
+
+    APInt DemandedElts0 = APInt::getLowBitsSet(VWidth0, 1);
+    APInt UndefElts0(VWidth0, 0);
+    if (Value *V = SimplifyDemandedVectorElts(Op0, DemandedElts0, UndefElts0)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+
+    APInt DemandedElts1 = APInt::getLowBitsSet(VWidth1, 1);
+    APInt UndefElts1(VWidth1, 0);
+    if (Value *V = SimplifyDemandedVectorElts(Op1, DemandedElts1, UndefElts1)) {
+      II->setArgOperand(1, V);
+      return II;
+    }
     break;
   }

Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -412,7 +412,7 @@
     Value *LHS, *RHS;
     if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN)
       return nullptr;
-    
+
     if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero,
                              RHSKnownOne, Depth + 1) ||
         SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero,
@@ -1237,6 +1237,15 @@
       // like undef&0. The result is known zero, not undef.
       UndefElts &= UndefElts2;
       break;
+
+    // SSE4A instructions leave the upper 64-bits of the 128-bit result
+    // in an undefined state.
+    case Intrinsic::x86_sse4a_extrq:
+    case Intrinsic::x86_sse4a_extrqi:
+    case Intrinsic::x86_sse4a_insertq:
+    case Intrinsic::x86_sse4a_insertqi:
+      UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2);
+      break;
     }
     break;
   }
Index: test/Transforms/InstCombine/x86-sse4a.ll
===================================================================
--- test/Transforms/InstCombine/x86-sse4a.ll
+++ test/Transforms/InstCombine/x86-sse4a.ll
@@ -1,125 +1,244 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-; We should optimize these two redundant insertqi into one
-; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
-; CHECK-NOT: insertqi
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
-  ret <2 x i64> %2
-}
-
-; The result of this insert is the second arg, since the top 64 bits of
-; the result are undefined, and we copy the bottom 64 bits from the
-; second arg
-; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> %i
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
-  ret <2 x i64> %1
-}
-
-; Test the several types of ranges and ordering that exist for two insertqi
-; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
-  ret <2 x i64> %2
-}
-
-; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
-}
-
-; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
-  ret <2 x i64> %2
-}
-
-; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
-}
-
-; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
-}
-
-; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
-}
-
-; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
-}
-
-; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
-}
-
-; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> %i
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
-  ret <2 x i64> %1
-}
-
-; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> undef
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
-  ret <2 x i64> %1
-}
-
-; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> undef
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
-  ret <2 x i64> %1
-}
-
-; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> undef
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
-  ret <2 x i64> %1
-}
-
-; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
-declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; We should optimize these two redundant insertqi into one
+; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
+; CHECK-NOT: insertqi
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
+  ret <2 x i64> %2
+}
+
+; The result of this insert is the second arg, since the top 64 bits of
+; the result are undefined, and we copy the bottom 64 bits from the
+; second arg
+; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> %i
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
+  ret <2 x i64> %1
+}
+
+; Test the several types of ranges and ordering that exist for two insertqi
+; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> %i
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
+  ret <2 x i64> %1
+}
+
+; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
+  ret <2 x i64> %1
+}
+
+; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
+  ret <2 x i64> %1
+}
+
+; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
+  ret <2 x i64> %1
+}
+
+;
+; Vector Demanded Bits
+;
+
+define <2 x i64> @test_extrq_arg0(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrq_arg0
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %1, <16 x i8> %y) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrq_arg1(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrq_arg1
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <16 x i8> %y, <16 x i8> undef, <16 x i32>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %1) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrq_args01(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrq_args01
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32>
+  %2 = shufflevector <16 x i8> %y, <16 x i8> undef, <16 x i32>
+  %3 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %1, <16 x i8> %2) nounwind
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_extrq_ret(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrq_ret
+; CHECK-NEXT: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32>
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrqi_arg0(<2 x i64> %x) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrqi_arg0
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %1, i8 3, i8 2)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrqi_ret(<2 x i64> %x) nounwind uwtable ssp {
+; CHECK-LABEL: @test_extrqi_ret
+; CHECK-NEXT: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32>
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertq_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertq_arg0
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %1, <2 x i64> %y) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertq_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertq_ret
+; CHECK-NEXT: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32>
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertqi_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertqi_arg0
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %y, i8 3, i8 2) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertqi_arg1(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertqi_arg1
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %y, <2 x i64> undef, <2 x i32>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %1, i8 3, i8 2) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertqi_args01(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertqi_args01
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32>
+  %2 = shufflevector <2 x i64> %y, <2 x i64> undef, <2 x i32>
+  %3 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %2, i8 3, i8 2) nounwind
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_insertqi_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK-LABEL: @test_insertqi_ret
+; CHECK-NEXT: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32>
+  ret <2 x i64> %2
+}
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.extrq
+declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.extrqi
+declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertq
+declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
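For reviewers, a minimal standalone sketch of the fold the new demanded-elements handling enables (not part of the patch; the function name and shuffle mask below are chosen for illustration). Because INSERTQI reads only the low 64 bits of its first operand, a shuffle that only rewrites the unused upper element is dead, so InstCombine can feed %v to the intrinsic directly, mirroring test_insertqi_arg0 above:

; Illustrative example only -- not part of the patch.
define <2 x i64> @insertqi_shuffle_example(<2 x i64> %v, <2 x i64> %i) {
  %s = shufflevector <2 x i64> %v, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  %r = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %s, <2 x i64> %i, i8 32, i8 0)
  ; Expected after -instcombine: the shuffle is removed and the call uses %v.
  ret <2 x i64> %r
}
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind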