Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -968,14 +968,49 @@
     break;
 
   case Intrinsic::x86_sse4a_extrq: {
-    // EXTRQ uses only the lowest 64-bits of the first 128-bit vector
-    // operands and the lowest 16-bits of the second.
     Value *Op0 = II->getArgOperand(0);
     Value *Op1 = II->getArgOperand(1);
     unsigned VWidth0 = Op0->getType()->getVectorNumElements();
     unsigned VWidth1 = Op1->getType()->getVectorNumElements();
     assert(VWidth0 == 2 && VWidth1 == 16 && "Unexpected operand sizes");
 
+    // See if we're dealing with constant values.
+    Constant *C0 = dyn_cast<Constant>(Op0);
+    Constant *C1 = dyn_cast<Constant>(Op1);
+    ConstantInt *CI0 =
+        C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0))
+           : nullptr;
+    ConstantInt *CILength =
+        C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0))
+           : nullptr;
+    ConstantInt *CIIndex =
+        C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1))
+           : nullptr;
+
+    // Constant Fold - if Op1 is constant - convert to extrqi.
+    if (CILength && CIIndex) {
+      unsigned Length = CILength->getValue().zextOrTrunc(6).getZExtValue();
+      unsigned Index = CIIndex->getValue().zextOrTrunc(6).getZExtValue();
+
+      Type *IntTy8 = Type::getInt8Ty(II->getContext());
+      Constant *CILength = ConstantInt::get(IntTy8, Length, false);
+      Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
+      Value *Args[] = {Op0, CILength, CIIndex};
+      Module *M = CI.getParent()->getParent()->getParent();
+      Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
+      return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
+    }
+
+    // Constant Fold - extraction from zero is always {zero, undef}.
+    if (CI0 && CI0->equalsInt(0)) {
+      Type *IntTy64 = Type::getInt64Ty(II->getContext());
+      Constant *Args[] = {ConstantInt::get(IntTy64, 0),
+                          UndefValue::get(IntTy64)};
+      return ReplaceInstUsesWith(CI, ConstantVector::get(Args));
+    }
+
+    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
+    // operands and the lowest 16-bits of the second.
     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
       II->setArgOperand(0, V);
       return II;
@@ -988,13 +1023,91 @@
   }
 
   case Intrinsic::x86_sse4a_extrqi: {
-    // EXTRQI uses only the lowest 64-bits of the first 128-bit vector
-    // operand.
-    Value *Op = II->getArgOperand(0);
-    unsigned VWidth = Op->getType()->getVectorNumElements();
-    assert(VWidth == 2 && "Unexpected operand size");
+    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
+    // bits of the lower 64-bits. The upper 64-bits are undefined.
+    Value *Op0 = II->getArgOperand(0);
+    unsigned VWidth = Op0->getType()->getVectorNumElements();
+    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
+           "Unexpected operand size");
+
+    auto LowConstantHighUndef = [&](uint64_t Val) {
+      Type *IntTy64 = Type::getInt64Ty(II->getContext());
+      Constant *Args[] = {ConstantInt::get(IntTy64, Val),
+                          UndefValue::get(IntTy64)};
+      return ReplaceInstUsesWith(CI, ConstantVector::get(Args));
+    };
+
+    // See if we're dealing with constant values.
+    Constant *C0 = dyn_cast<Constant>(Op0);
+    ConstantInt *CI0 =
+        C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0))
+           : nullptr;
+
+    if (ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+      if (ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+        unsigned Index = CIIndex->getZExtValue();
+
+        // From AMD documentation: "a value of zero in the field length is
+        // defined as length of 64".
+        unsigned Length =
+            CILength->equalsInt(0) ? 64 : CILength->getZExtValue();
+
+        // From AMD documentation: "If the sum of the bit index + length field
+        // is greater than 64, the results are undefined".
+        unsigned End = Index + Length;
+
+        // Note that both field index and field length are 8-bit quantities.
+        // Since variables 'Index' and 'Length' are unsigned values
+        // obtained from zero-extending field index and field length
+        // respectively, their sum should never wrap around.
+        if (End > 64)
+          return ReplaceInstUsesWith(CI, UndefValue::get(II->getType()));
+
+        // If we are inserting whole bytes, we can convert this to a shuffle.
+        // Lowering can recognize EXTRQI shuffle masks.
+        if ((Length % 8) == 0 && (Index % 8) == 0) {
+          // Convert bit indices to byte indices.
+          Length /= 8;
+          Index /= 8;
+
+          Type *IntTy8 = Type::getInt8Ty(II->getContext());
+          Type *IntTy32 = Type::getInt32Ty(II->getContext());
+          VectorType *ShufTy = VectorType::get(IntTy8, 16);
+          SmallVector<Constant *, 16> ShuffleMask;
+          for (int i = 0; i != Length; ++i)
+            ShuffleMask.push_back(
+                Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
+          for (int i = Length; i != 8; ++i)
+            ShuffleMask.push_back(
+                Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
+          for (int i = 8; i != 16; ++i)
+            ShuffleMask.push_back(UndefValue::get(IntTy32));
+
+          Value *SV =
+              Builder->CreateShuffleVector(Builder->CreateBitCast(Op0, ShufTy),
+                                           ConstantAggregateZero::get(ShufTy),
+                                           ConstantVector::get(ShuffleMask));
+          return ReplaceInstUsesWith(CI,
+                                     Builder->CreateBitCast(SV, II->getType()));
+        }
+
+        // Constant Fold - shift Index'th bit to lowest position and mask off
+        // Length bits.
+        if (CI0) {
+          APInt Elt = CI0->getValue();
+          Elt = Elt.lshr(Index).zextOrTrunc(Length);
+          return LowConstantHighUndef(Elt.getZExtValue());
+        }
+      }
+    }
 
-    if (Value *V = SimplifyDemandedVectorEltsLow(Op, VWidth, 1)) {
+    // Constant Fold - extraction from zero is always {zero, undef}.
+    if (CI0 && CI0->equalsInt(0))
+      return LowConstantHighUndef(0);
+
+    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
+    // operand.
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
       II->setArgOperand(0, V);
       return II;
@@ -1002,13 +1115,36 @@
   }
 
   case Intrinsic::x86_sse4a_insertq: {
-    // INSERTQ uses only the lowest 64-bits of the first 128-bit vector
-    // operand.
-    Value *Op = II->getArgOperand(0);
-    unsigned VWidth = Op->getType()->getVectorNumElements();
-    assert(VWidth == 2 && "Unexpected operand size");
+    Value *Op0 = II->getArgOperand(0);
+    Value *Op1 = II->getArgOperand(1);
+    unsigned VWidth = Op0->getType()->getVectorNumElements();
+    assert(Op1->getType()->getVectorNumElements() == 2 && VWidth == 2 &&
+           "Unexpected operand size");
+
+    // See if we're dealing with constant values.
+    Constant *C1 = dyn_cast<Constant>(Op1);
+    ConstantInt *CI11 =
+        C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1))
+           : nullptr;
+
+    // Constant Fold - if Op1 is constant - convert to insertqi.
+    if (CI11) {
+      APInt V11 = CI11->getValue();
+      unsigned Length = V11.zextOrTrunc(6).getZExtValue();
+      unsigned Index = V11.lshr(8).zextOrTrunc(6).getZExtValue();
+
+      Type *IntTy8 = Type::getInt8Ty(II->getContext());
+      Constant *CILength = ConstantInt::get(IntTy8, Length, false);
+      Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
+      Value *Args[] = {Op0, Op1, CILength, CIIndex};
+      Module *M = CI.getParent()->getParent()->getParent();
+      Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
+      return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
+    }
 
-    if (Value *V = SimplifyDemandedVectorEltsLow(Op, VWidth, 1)) {
+    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
+    // operand.
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
       II->setArgOperand(0, V);
       return II;
@@ -1016,15 +1152,23 @@
   }
 
   case Intrinsic::x86_sse4a_insertqi: {
-    // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
-    // ones undef
-    // TODO: eventually we should lower this intrinsic to IR
+    // INSERTQI: Extract lowest Length bits from lower half of second source and
+    // insert over first source starting at Index bit. The upper 64-bits are
+    // undefined.
+    Value *Op0 = II->getArgOperand(0);
+    Value *Op1 = II->getArgOperand(1);
+    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
+    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
+    assert(VWidth0 == 2 && VWidth1 == 2 && "Unexpected operand sizes");
+
     if (auto CILength = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
       if (auto CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3))) {
         unsigned Index = CIIndex->getZExtValue();
+
         // From AMD documentation: "a value of zero in the field length is
         // defined as length of 64".
-        unsigned Length = CILength->equalsInt(0) ? 64 : CILength->getZExtValue();
+        unsigned Length =
+            CILength->equalsInt(0) ? 64 : CILength->getZExtValue();
 
         // From AMD documentation: "If the sum of the bit index + length field
         // is greater than 64, the results are undefined".
@@ -1037,69 +1181,66 @@
         if (End > 64)
           return ReplaceInstUsesWith(CI, UndefValue::get(II->getType()));
 
-        if (Length == 64 && Index == 0) {
-          Value *Vec = II->getArgOperand(1);
-          Value *Undef = UndefValue::get(Vec->getType());
-          const uint32_t Mask[] = { 0, 2 };
-          return ReplaceInstUsesWith(
-              CI,
-              Builder->CreateShuffleVector(
-                  Vec, Undef, ConstantDataVector::get(
-                                  II->getContext(), makeArrayRef(Mask))));
-        } else if (auto Source =
-                       dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
-          if (Source->hasOneUse() &&
-              Source->getArgOperand(1) == II->getArgOperand(1)) {
-            // If the source of the insert has only one use and it's another
-            // insert (and they're both inserting from the same vector), try to
-            // bundle both together.
-            auto CISourceLength =
-                dyn_cast<ConstantInt>(Source->getArgOperand(2));
-            auto CISourceIndex =
-                dyn_cast<ConstantInt>(Source->getArgOperand(3));
-            if (CISourceIndex && CISourceLength) {
-              unsigned SourceIndex = CISourceIndex->getZExtValue();
-              unsigned SourceLength = CISourceLength->getZExtValue();
-              unsigned SourceEnd = SourceIndex + SourceLength;
-              unsigned NewIndex, NewLength;
-              bool ShouldReplace = false;
-              if (Index <= SourceIndex && SourceIndex <= End) {
-                NewIndex = Index;
-                NewLength = std::max(End, SourceEnd) - NewIndex;
-                ShouldReplace = true;
-              } else if (SourceIndex <= Index && Index <= SourceEnd) {
-                NewIndex = SourceIndex;
-                NewLength = std::max(SourceEnd, End) - NewIndex;
-                ShouldReplace = true;
-              }
-
-              if (ShouldReplace) {
-                Constant *ConstantLength = ConstantInt::get(
-                    II->getArgOperand(2)->getType(), NewLength, false);
-                Constant *ConstantIndex = ConstantInt::get(
-                    II->getArgOperand(3)->getType(), NewIndex, false);
-                Value *Args[4] = { Source->getArgOperand(0),
-                                   II->getArgOperand(1), ConstantLength,
-                                   ConstantIndex };
-                Module *M = CI.getParent()->getParent()->getParent();
-                Value *F =
-                    Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
-                return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
-              }
-            }
-          }
+        // If we are inserting whole bytes, we can convert this to a shuffle.
+        // Lowering can recognize INSERTQI shuffle masks.
+        if ((Length % 8) == 0 && (Index % 8) == 0) {
+          // Convert bit indices to byte indices.
+          Length /= 8;
+          Index /= 8;
+
+          Type *IntTy8 = Type::getInt8Ty(II->getContext());
+          Type *IntTy32 = Type::getInt32Ty(II->getContext());
+          VectorType *ShufTy = VectorType::get(IntTy8, 16);
+          SmallVector<Constant *, 16> ShuffleMask;
+          for (int i = 0; i != Index; ++i)
+            ShuffleMask.push_back(
+                Constant::getIntegerValue(IntTy32, APInt(32, i)));
+          for (int i = 0; i != Length; ++i)
+            ShuffleMask.push_back(
+                Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
+          for (int i = Index + Length; i != 8; ++i)
+            ShuffleMask.push_back(
+                Constant::getIntegerValue(IntTy32, APInt(32, i)));
+          for (int i = 8; i != 16; ++i)
+            ShuffleMask.push_back(UndefValue::get(IntTy32));
+
+          Value *SV =
+              Builder->CreateShuffleVector(Builder->CreateBitCast(Op0, ShufTy),
+                                           Builder->CreateBitCast(Op1, ShufTy),
+                                           ConstantVector::get(ShuffleMask));
+          return ReplaceInstUsesWith(CI,
+                                     Builder->CreateBitCast(SV, II->getType()));
+        }
+
+        // See if we're dealing with constant values.
+        Constant *C0 = dyn_cast<Constant>(Op0);
+        Constant *C1 = dyn_cast<Constant>(Op1);
+        ConstantInt *CI00 =
+            C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0))
+               : nullptr;
+        ConstantInt *CI10 =
+            C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0))
+               : nullptr;
+
+        // Constant Fold - insert bottom Length bits starting at the Index'th
+        // bit.
+        if (CI00 && CI10) {
+          APInt V00 = CI00->getValue();
+          APInt V10 = CI10->getValue();
+          APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
+          V00 = V00 & ~Mask;
+          V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
+          APInt Val = V00 | V10;
+          Type *IntTy64 = Type::getInt64Ty(II->getContext());
+          Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
+                              UndefValue::get(IntTy64)};
+          return ReplaceInstUsesWith(CI, ConstantVector::get(Args));
         }
       }
     }
 
-    // INSERTQI uses only the lowest 64-bits of the first two 128-bit vector
+    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
     // operands.
-    Value *Op0 = II->getArgOperand(0);
-    Value *Op1 = II->getArgOperand(1);
-    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
-    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
-    assert(VWidth0 == 2 && VWidth1 == 2 && "Unexpected operand sizes");
-
     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
       II->setArgOperand(0, V);
       return II;
@@ -1183,9 +1324,9 @@
         // control mask is set, then zero is written in the result byte.
         // The zero vector is in the right-hand side of the resulting
        // shufflevector.
-
+
        // The value of each index is the least significant 4 bits of the
-        // shuffle control byte.
+        // shuffle control byte.
         Indexes[I] = (Index < 0) ? NumElts : Index & 0xF;
       }
     } else if (!isa<ConstantAggregateZero>(V))
Index: test/Transforms/InstCombine/x86-sse4a.ll
===================================================================
--- test/Transforms/InstCombine/x86-sse4a.ll
+++ test/Transforms/InstCombine/x86-sse4a.ll
@@ -1,122 +1,186 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-; We should optimize these two redundant insertqi into one
-; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
-; CHECK-NOT: insertqi
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
-  ret <2 x i64> %2
+;
+; EXTRQ
+;
+
+define <2 x i64> @test_extrq_call(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_call
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind
+  ret <2 x i64> %1
 }
 
-; The result of this insert is the second arg, since the top 64 bits of
-; the result are undefined, and we copy the bottom 64 bits from the
-; second arg
-; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> %i
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
+define <2 x i64> @test_extrq_zero_arg0(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_zero_arg0
+; CHECK-NEXT: ret <2 x i64> <i64 0, i64 undef>
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> zeroinitializer, <16 x i8> %y) nounwind
   ret <2 x i64> %1
 }
 
-; Test the several types of ranges and ordering that exist for two insertqi
-; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrq_zero_arg1(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_zero_arg1
+; CHECK-NEXT: ret <2 x i64> %x
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> zeroinitializer) nounwind
+  ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrq_to_extqi(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_to_extqi
+; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 15)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> ) nounwind
+  ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
-  ret <2 x i64> %2
+;
+; EXTRQI
+;
+
+define <2 x i64> @test_extrqi_call(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_call
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 23)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 23)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrqi_shuffle_1zuu(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_shuffle_1zuu
+; CHECK-NEXT: %1 = bitcast <2 x i64> %x to <16 x i8>
+; CHECK-NEXT: %2 = shufflevector <16 x i8> %1, <16 x i8> , <16 x i32>
+; CHECK-NEXT: %3 = bitcast <16 x i8> %2 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %3
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 32, i8 32)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrqi_shuffle_2zzzzzzzuuuuuuuu(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_shuffle_2zzzzzzzuuuuuuuu
+; CHECK-NEXT: %1 = bitcast <2 x i64> %x to <16 x i8>
+; CHECK-NEXT: %2 = shufflevector <16 x i8> %1, <16 x i8> , <16 x i32>
+; CHECK-NEXT: %3 = bitcast <16 x i8> %2 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %3
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 16)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrqi_undef(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_undef
+; CHECK-NEXT: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> zeroinitializer, i8 32, i8 33)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrqi_zero(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_zero
+; CHECK-NEXT: ret <2 x i64> <i64 0, i64 undef>
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> zeroinitializer, i8 3, i8 18)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+define <2 x i64> @test_extrqi_constant(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_constant
+; CHECK-NEXT: ret <2 x i64>
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> , i8 3, i8 18)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrqi_constant_undef(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_constant_undef
+; CHECK-NEXT: ret <2 x i64>
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> , i8 3, i8 18)
+  ret <2 x i64> %1
+}
+
+;
+; INSERTQ
+;
+
+define <2 x i64> @test_insertq_call(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertq_call
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_insertq_to_insertqi(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertq_to_insertqi
+; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> , i8 18, i8 2)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> ) nounwind
+  ret <2 x i64> %1
+}
+
+;
+; INSERTQI
+;
+
+define <2 x i64> @test_insertqi_shuffle_04uu(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @test_insertqi_shuffle_04uu
+; CHECK-NEXT: %1 = bitcast <2 x i64> %i to <16 x i8>
+; CHECK-NEXT: %2 = bitcast <2 x i64> %v to <16 x i8>
+; CHECK-NEXT: %3 = shufflevector <16 x i8> %2, <16 x i8> %1, <16 x i32>
+; CHECK-NEXT: %4 = bitcast <16 x i8> %3 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %4
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
   ret <2 x i64> %1
 }
 
+define <2 x i64> @test_insertqi_shuffle_8123uuuu(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @test_insertqi_shuffle_8123uuuu
+; CHECK-NEXT: %1 = bitcast <2 x i64> %i to <16 x i8>
+; CHECK-NEXT: %2 = bitcast <2 x i64> %v to <16 x i8>
+; CHECK-NEXT: %3 = shufflevector <16 x i8> %2, <16 x i8> %1, <16 x i32>
+; CHECK-NEXT: %4 = bitcast <16 x i8> %3 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %4
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_insertqi_constant(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @test_insertqi_constant
+; CHECK-NEXT: ret <2 x i64>
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> , <2 x i64> , i8 16, i8 1)
+  ret <2 x i64> %1
+}
+
+; The result of this insert is the second arg, since the top 64 bits of
+; the result are undefined, and we copy the bottom 64 bits from the
+; second arg
+define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @testInsert64Bits
+; CHECK-NEXT: ret <2 x i64> %i
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
+  ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i)
 define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> %i
+; CHECK-LABEL: @testZeroLength
+; CHECK-NEXT: ret <2 x i64> %i
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i)
 define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> undef
+; CHECK-LABEL: @testUndefinedInsertq_1
+; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i)
 define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> undef
+; CHECK-LABEL: @testUndefinedInsertq_2
+; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i)
 define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> undef
+; CHECK-LABEL: @testUndefinedInsertq_3
+; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
   ret <2 x i64> %1
 }
@@ -125,7 +189,7 @@
 ;
 ; Vector Demanded Bits
 ;
-define <2 x i64> @test_extrq_arg0(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+define <2 x i64> @test_extrq_arg0(<2 x i64> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @test_extrq_arg0
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -134,7 +198,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_extrq_arg1(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+define <2 x i64> @test_extrq_arg1(<2 x i64> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @test_extrq_arg1
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -143,7 +207,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_extrq_args01(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+define <2 x i64> @test_extrq_args01(<2 x i64> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @test_extrq_args01
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -153,7 +217,7 @@
   ret <2 x i64> %3
 }
 
-define <2 x i64> @test_extrq_ret(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+define <2 x i64> @test_extrq_ret(<2 x i64> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @test_extrq_ret
 ; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind
@@ -161,7 +225,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_extrqi_arg0(<2 x i64> %x) nounwind uwtable ssp {
+define <2 x i64> @test_extrqi_arg0(<2 x i64> %x) {
 ; CHECK-LABEL: @test_extrqi_arg0
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -170,7 +234,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_extrqi_ret(<2 x i64> %x) nounwind uwtable ssp {
+define <2 x i64> @test_extrqi_ret(<2 x i64> %x) {
 ; CHECK-LABEL: @test_extrqi_ret
 ; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2) nounwind
@@ -178,7 +242,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_insertq_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertq_arg0(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertq_arg0
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -187,7 +251,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_insertq_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertq_ret(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertq_ret
 ; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
@@ -195,7 +259,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_insertqi_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertqi_arg0(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertqi_arg0
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -204,7 +268,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_insertqi_arg1(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertqi_arg1(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertqi_arg1
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -213,7 +277,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_insertqi_args01(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertqi_args01(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertqi_args01
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -223,7 +287,7 @@
   ret <2 x i64> %3
 }
 
-define <2 x i64> @test_insertqi_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertqi_ret(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertqi_ret
 ; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2) nounwind