Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -381,6 +381,197 @@ return nullptr; } +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. \p CILength and \p CIIndex may be null when the length or index is not a constant; returns nullptr if no simplification applies. +static Value *SimplifyX86extrq(IntrinsicInst &II, Value *Op0, + ConstantInt *CILength, ConstantInt *CIIndex, + InstCombiner::BuilderTy &Builder) { + auto LowConstantHighUndef = [&](uint64_t Val) { // Builds the two-element i64 constant vector {Val, undef}. + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + }; + + // See if element 0 of the source operand is a constant integer. + Constant *C0 = dyn_cast(Op0); + ConstantInt *CI0 = + C0 ? dyn_cast(C0->getAggregateElement((unsigned)0)) + : nullptr; + + // With a constant length and index, attempt to fold to undef, a shuffle, a constant or an EXTRQI call. + if (CILength && CIIndex) { + // From AMD documentation: "The bit index and field length are each six + // bits in length; other bits of the field are ignored." + // Only the low six bits of each value are therefore kept. + APInt APIndex = CIIndex->getValue().zextOrTrunc(6); + APInt APLength = CILength->getValue().zextOrTrunc(6); + + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length were truncated to six bits above. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are extracting whole bytes, we can convert this to a shuffle. 
+ // Lowering can recognize EXTRQI shuffle masks. The mask selects Length bytes of Op0 starting at byte Index, fills the rest of the low 8 bytes from the all-zero second operand, and leaves the upper 8 bytes undef. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices (the shuffle operates on 16 x i8). + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector ShuffleMask; + for (int i = 0; i != Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + Index))); + for (int i = Length; i != 8; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector( + Builder.CreateBitCast(Op0, ShufTy), + ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // Constant Fold - shift Index'th bit to lowest position and mask off + // Length bits. + if (CI0) { + APInt Elt = CI0->getValue(); + Elt = Elt.lshr(Index).zextOrTrunc(Length); + return LowConstantHighUndef(Elt.getZExtValue()); + } + + // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { + Value *Args[] = {Op0, CILength, CIIndex}; + Module *M = II.getParent()->getParent()->getParent(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + return Builder.CreateCall(F, Args); + } + } + + // Constant Fold - extraction from zero is always {zero, undef}. + if (CI0 && CI0->equalsInt(0)) + return LowConstantHighUndef(0); + + return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. 
+static Value *SimplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, + APInt APLength, APInt APIndex, + InstCombiner::BuilderTy &Builder) { + + // From AMD documentation: "The bit index and field length are each six bits + // in length; other bits of the field are ignored." + APIndex = APIndex.zextOrTrunc(6); + APLength = APLength.zextOrTrunc(6); + + // Bit index at which the inserted field starts. + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length were truncated to six bits above. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize INSERTQI shuffle masks. The mask keeps the Op0 bytes below Index and from Index + Length up to byte 8, takes the low Length bytes of Op1 in between, and leaves the upper 8 bytes undef. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices (the shuffle operates on 16 x i8). 
+ Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector ShuffleMask; + for (int i = 0; i != Index; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 0; i != Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = Index + Length; i != 8; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), + Builder.CreateBitCast(Op1, ShufTy), + ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // See if element 0 of each source operand is a constant integer. + Constant *C0 = dyn_cast(Op0); + Constant *C1 = dyn_cast(Op1); + ConstantInt *CI00 = + C0 ? dyn_cast(C0->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CI10 = + C1 ? dyn_cast(C1->getAggregateElement((unsigned)0)) + : nullptr; + + // Constant Fold - insert bottom Length bits starting at the Index'th bit. + if (CI00 && CI10) { + APInt V00 = CI00->getValue(); + APInt V10 = CI10->getValue(); + APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); + V00 = V00 & ~Mask; + V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); + APInt Val = V00 | V10; + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + } + + // If we were an INSERTQ call, we'll save demanded elements if we convert to + // INSERTQI. 
+ if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Constant *CILength = ConstantInt::get(IntTy8, Length, false); + Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + + Value *Args[] = {Op0, Op1, CILength, CIIndex}; + Module *M = II.getParent()->getParent()->getParent(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return Builder.CreateCall(F, Args); + } + + return nullptr; +} + /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit /// source vectors, unless a zero bit is set. If a zero bit is set, /// then ignore that half of the mask and clear that half of the vector. @@ -1005,14 +1196,29 @@ break; case Intrinsic::x86_sse4a_extrq: { - // EXTRQ uses only the lowest 64-bits of the first 128-bit vector - // operands and the lowest 16-bits of the second. Value *Op0 = II->getArgOperand(0); Value *Op1 = II->getArgOperand(1); unsigned VWidth0 = Op0->getType()->getVectorNumElements(); unsigned VWidth1 = Op1->getType()->getVectorNumElements(); - assert(VWidth0 == 2 && VWidth1 == 16 && "Unexpected operand sizes"); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 16 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast(Op1); + ConstantInt *CILength = + C1 ? dyn_cast(C1->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CIIndex = + C1 ? dyn_cast(C1->getAggregateElement((unsigned)1)) + : nullptr; + // Attempt to simplify to a constant, shuffle vector or EXTRQI call. + if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) + return ReplaceInstUsesWith(*II, V); + + // EXTRQ only uses the lowest 64-bits of the first 128-bit vector + // operands and the lowest 16-bits of the second. 
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { II->setArgOperand(0, V); return II; @@ -1025,13 +1231,24 @@ } case Intrinsic::x86_sse4a_extrqi: { - // EXTRQI uses only the lowest 64-bits of the first 128-bit vector - // operand. - Value *Op = II->getArgOperand(0); - unsigned VWidth = Op->getType()->getVectorNumElements(); - assert(VWidth == 2 && "Unexpected operand size"); + // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining + // bits of the lower 64-bits. The upper 64-bits are undefined. + Value *Op0 = II->getArgOperand(0); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast(II->getArgOperand(1)); + ConstantInt *CIIndex = dyn_cast(II->getArgOperand(2)); + + // Attempt to simplify to a constant or shuffle vector. + if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) + return ReplaceInstUsesWith(*II, V); - if (Value *V = SimplifyDemandedVectorEltsLow(Op, VWidth, 1)) { + // EXTRQI only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { II->setArgOperand(0, V); return II; } @@ -1039,13 +1256,32 @@ } case Intrinsic::x86_sse4a_insertq: { - // INSERTQ uses only the lowest 64-bits of the first 128-bit vector - // operand. 
- Value *Op = II->getArgOperand(0); - unsigned VWidth = Op->getType()->getVectorNumElements(); - assert(VWidth == 2 && "Unexpected operand size"); + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + Op1->getType()->getVectorNumElements() == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast(Op1); + ConstantInt *CI11 = + C1 ? dyn_cast(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or INSERTQI call. + if (CI11) { + APInt V11 = CI11->getValue(); + APInt Len = V11.zextOrTrunc(6); + APInt Idx = V11.lshr(8).zextOrTrunc(6); + if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) + return ReplaceInstUsesWith(*II, V); + } - if (Value *V = SimplifyDemandedVectorEltsLow(Op, VWidth, 1)) { + // INSERTQ only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { II->setArgOperand(0, V); return II; } @@ -1053,49 +1289,31 @@ } case Intrinsic::x86_sse4a_insertqi: { - // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top - // ones undef - // TODO: eventually we should lower this intrinsic to IR - if (auto CILength = dyn_cast(II->getArgOperand(2))) { - if (auto CIIndex = dyn_cast(II->getArgOperand(3))) { - unsigned Index = CIIndex->getZExtValue(); - - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = CILength->equalsInt(0) ? 64 : CILength->getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". 
- unsigned End = Index + Length; - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if (End > 64) - return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - - if (Length == 64 && Index == 0) { - Value *Vec = II->getArgOperand(1); - Value *Undef = UndefValue::get(Vec->getType()); - const uint32_t Mask[] = {0, 2}; - return ReplaceInstUsesWith( - CI, - Builder->CreateShuffleVector( - Vec, Undef, ConstantDataVector::get( - II->getContext(), makeArrayRef(Mask)))); - } - } - } - - // INSERTQI uses only the lowest 64-bits of the first two 128-bit vector - // operands. + // INSERTQI: Extract lowest Length bits from lower half of second source and + // insert over first source starting at Index bit. The upper 64-bits are + // undefined. Value *Op0 = II->getArgOperand(0); Value *Op1 = II->getArgOperand(1); unsigned VWidth0 = Op0->getType()->getVectorNumElements(); unsigned VWidth1 = Op1->getType()->getVectorNumElements(); - assert(VWidth0 == 2 && VWidth1 == 2 && "Unexpected operand sizes"); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 2 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast(II->getArgOperand(2)); + ConstantInt *CIIndex = dyn_cast(II->getArgOperand(3)); + + // Attempt to simplify to a constant or shuffle vector. + if (CILength && CIIndex) { + APInt Len = CILength->getValue().zextOrTrunc(6); + APInt Idx = CIIndex->getValue().zextOrTrunc(6); + if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) + return ReplaceInstUsesWith(*II, V); + } + // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector + // operands. 
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { II->setArgOperand(0, V); return II; Index: test/Transforms/InstCombine/x86-sse4a.ll =================================================================== --- test/Transforms/InstCombine/x86-sse4a.ll +++ test/Transforms/InstCombine/x86-sse4a.ll @@ -1,5 +1,180 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s +; +; EXTRQ +; + +define <2 x i64> @test_extrq_call(<2 x i64> %x, <16 x i8> %y) { +; CHECK-LABEL: @test_extrq_call +; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) +; CHECK-NEXT: ret <2 x i64> %1 + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrq_zero_arg0(<2 x i64> %x, <16 x i8> %y) { +; CHECK-LABEL: @test_extrq_zero_arg0 +; CHECK-NEXT: ret <2 x i64> + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> zeroinitializer, <16 x i8> %y) nounwind + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrq_zero_arg1(<2 x i64> %x, <16 x i8> %y) { +; CHECK-LABEL: @test_extrq_zero_arg1 +; CHECK-NEXT: ret <2 x i64> %x + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> zeroinitializer) nounwind + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrq_to_extqi(<2 x i64> %x, <16 x i8> %y) { +; CHECK-LABEL: @test_extrq_to_extqi +; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 15) +; CHECK-NEXT: ret <2 x i64> %1 + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> ) nounwind + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrq_constant(<2 x i64> %x, <16 x i8> %y) { +; CHECK-LABEL: @test_extrq_constant +; CHECK-NEXT: ret <2 x i64> + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> , <16 x i8> ) nounwind + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrq_constant_undef(<2 x i64> %x, <16 x i8> %y) { +; CHECK-LABEL: @test_extrq_constant_undef +; CHECK-NEXT: ret <2 x i64> + %1 = tail call <2 x i64> 
@llvm.x86.sse4a.extrq(<2 x i64> , <16 x i8> ) nounwind + ret <2 x i64> %1 +} + +; +; EXTRQI +; + +define <2 x i64> @test_extrqi_call(<2 x i64> %x) { +; CHECK-LABEL: @test_extrqi_call +; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 23) +; CHECK-NEXT: ret <2 x i64> %1 + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 23) + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrqi_shuffle_1zuu(<2 x i64> %x) { +; CHECK-LABEL: @test_extrqi_shuffle_1zuu +; CHECK-NEXT: %1 = bitcast <2 x i64> %x to <16 x i8> +; CHECK-NEXT: %2 = shufflevector <16 x i8> %1, <16 x i8> , <16 x i32> +; CHECK-NEXT: %3 = bitcast <16 x i8> %2 to <2 x i64> +; CHECK-NEXT: ret <2 x i64> %3 + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 32, i8 32) + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrqi_shuffle_2zzzzzzzuuuuuuuu(<2 x i64> %x) { +; CHECK-LABEL: @test_extrqi_shuffle_2zzzzzzzuuuuuuuu +; CHECK-NEXT: %1 = bitcast <2 x i64> %x to <16 x i8> +; CHECK-NEXT: %2 = shufflevector <16 x i8> %1, <16 x i8> , <16 x i32> +; CHECK-NEXT: %3 = bitcast <16 x i8> %2 to <2 x i64> +; CHECK-NEXT: ret <2 x i64> %3 + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 16) + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrqi_undef(<2 x i64> %x) { +; CHECK-LABEL: @test_extrqi_undef +; CHECK-NEXT: ret <2 x i64> undef + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> zeroinitializer, i8 32, i8 33) + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrqi_zero(<2 x i64> %x) { +; CHECK-LABEL: @test_extrqi_zero +; CHECK-NEXT: ret <2 x i64> + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> zeroinitializer, i8 3, i8 18) + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrqi_constant(<2 x i64> %x) { +; CHECK-LABEL: @test_extrqi_constant +; CHECK-NEXT: ret <2 x i64> + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> , i8 3, i8 18) + ret <2 x i64> %1 +} + +define <2 x i64> @test_extrqi_constant_undef(<2 x 
i64> %x) { +; CHECK-LABEL: @test_extrqi_constant_undef +; CHECK-NEXT: ret <2 x i64> + %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> , i8 4, i8 18) + ret <2 x i64> %1 +} + +; +; INSERTQ +; + +define <2 x i64> @test_insertq_call(<2 x i64> %x, <2 x i64> %y) { +; CHECK-LABEL: @test_insertq_call +; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) +; CHECK-NEXT: ret <2 x i64> %1 + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind + ret <2 x i64> %1 +} + +define <2 x i64> @test_insertq_to_insertqi(<2 x i64> %x, <2 x i64> %y) { +; CHECK-LABEL: @test_insertq_to_insertqi +; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> , i8 18, i8 2) +; CHECK-NEXT: ret <2 x i64> %1 + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> ) nounwind + ret <2 x i64> %1 +} + +define <2 x i64> @test_insertq_constant(<2 x i64> %x, <2 x i64> %y) { +; CHECK-LABEL: @test_insertq_constant +; CHECK-NEXT: ret <2 x i64> + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> , <2 x i64> ) nounwind + ret <2 x i64> %1 +} + +define <2 x i64> @test_insertq_constant_undef(<2 x i64> %x, <2 x i64> %y) { +; CHECK-LABEL: @test_insertq_constant_undef +; CHECK-NEXT: ret <2 x i64> + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> , <2 x i64> ) nounwind + ret <2 x i64> %1 +} + +; +; INSERTQI +; + +define <2 x i64> @test_insertqi_shuffle_04uu(<2 x i64> %v, <2 x i64> %i) { +; CHECK-LABEL: @test_insertqi_shuffle_04uu +; CHECK-NEXT: %1 = bitcast <2 x i64> %i to <16 x i8> +; CHECK-NEXT: %2 = bitcast <2 x i64> %v to <16 x i8> +; CHECK-NEXT: %3 = shufflevector <16 x i8> %2, <16 x i8> %1, <16 x i32> +; CHECK-NEXT: %4 = bitcast <16 x i8> %3 to <2 x i64> +; CHECK-NEXT: ret <2 x i64> %4 + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32) + ret <2 x i64> %1 +} + +define <2 x i64> @test_insertqi_shuffle_8123uuuu(<2 x i64> %v, <2 x 
i64> %i) { +; CHECK-LABEL: @test_insertqi_shuffle_8123uuuu +; CHECK-NEXT: %1 = bitcast <2 x i64> %i to <16 x i8> +; CHECK-NEXT: %2 = bitcast <2 x i64> %v to <16 x i8> +; CHECK-NEXT: %3 = shufflevector <16 x i8> %2, <16 x i8> %1, <16 x i32> +; CHECK-NEXT: %4 = bitcast <16 x i8> %3 to <2 x i64> +; CHECK-NEXT: ret <2 x i64> %4 + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) + ret <2 x i64> %1 +} + +define <2 x i64> @test_insertqi_constant(<2 x i64> %v, <2 x i64> %i) { +; CHECK-LABEL: @test_insertqi_constant +; CHECK-NEXT: ret <2 x i64> + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> , <2 x i64> , i8 16, i8 1) + ret <2 x i64> %1 +} + ; The result of this insert is the second arg, since the top 64 bits of ; the result are undefined, and we copy the bottom 64 bits from the ; second arg @@ -42,7 +217,7 @@ ; Vector Demanded Bits ; -define <2 x i64> @test_extrq_arg0(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp { +define <2 x i64> @test_extrq_arg0(<2 x i64> %x, <16 x i8> %y) { ; CHECK-LABEL: @test_extrq_arg0 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) ; CHECK-NEXT: ret <2 x i64> %1 @@ -51,7 +226,7 @@ ret <2 x i64> %2 } -define <2 x i64> @test_extrq_arg1(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp { +define <2 x i64> @test_extrq_arg1(<2 x i64> %x, <16 x i8> %y) { ; CHECK-LABEL: @test_extrq_arg1 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) ; CHECK-NEXT: ret <2 x i64> %1 @@ -60,7 +235,7 @@ ret <2 x i64> %2 } -define <2 x i64> @test_extrq_args01(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp { +define <2 x i64> @test_extrq_args01(<2 x i64> %x, <16 x i8> %y) { ; CHECK-LABEL: @test_extrq_args01 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) ; CHECK-NEXT: ret <2 x i64> %1 @@ -70,7 +245,7 @@ ret <2 x i64> %3 } -define <2 x i64> @test_extrq_ret(<2 x i64> %x, <16 x i8> %y) nounwind 
uwtable ssp { +define <2 x i64> @test_extrq_ret(<2 x i64> %x, <16 x i8> %y) { ; CHECK-LABEL: @test_extrq_ret ; CHECK-NEXT: ret <2 x i64> undef %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind @@ -78,7 +253,7 @@ ret <2 x i64> %2 } -define <2 x i64> @test_extrqi_arg0(<2 x i64> %x) nounwind uwtable ssp { +define <2 x i64> @test_extrqi_arg0(<2 x i64> %x) { ; CHECK-LABEL: @test_extrqi_arg0 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2) ; CHECK-NEXT: ret <2 x i64> %1 @@ -87,7 +262,7 @@ ret <2 x i64> %2 } -define <2 x i64> @test_extrqi_ret(<2 x i64> %x) nounwind uwtable ssp { +define <2 x i64> @test_extrqi_ret(<2 x i64> %x) { ; CHECK-LABEL: @test_extrqi_ret ; CHECK-NEXT: ret <2 x i64> undef %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2) nounwind @@ -95,7 +270,7 @@ ret <2 x i64> %2 } -define <2 x i64> @test_insertq_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp { +define <2 x i64> @test_insertq_arg0(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @test_insertq_arg0 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) ; CHECK-NEXT: ret <2 x i64> %1 @@ -104,7 +279,7 @@ ret <2 x i64> %2 } -define <2 x i64> @test_insertq_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp { +define <2 x i64> @test_insertq_ret(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @test_insertq_ret ; CHECK-NEXT: ret <2 x i64> undef %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind @@ -112,7 +287,7 @@ ret <2 x i64> %2 } -define <2 x i64> @test_insertqi_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp { +define <2 x i64> @test_insertqi_arg0(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @test_insertqi_arg0 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2) ; CHECK-NEXT: ret <2 x i64> %1 @@ -121,7 +296,7 @@ ret <2 x i64> %2 } -define <2 x i64> @test_insertqi_arg1(<2 x i64> 
%x, <2 x i64> %y) nounwind uwtable ssp { +define <2 x i64> @test_insertqi_arg1(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @test_insertqi_arg1 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2) ; CHECK-NEXT: ret <2 x i64> %1 @@ -130,7 +305,7 @@ ret <2 x i64> %2 } -define <2 x i64> @test_insertqi_args01(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp { +define <2 x i64> @test_insertqi_args01(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @test_insertqi_args01 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2) ; CHECK-NEXT: ret <2 x i64> %1 @@ -140,7 +315,7 @@ ret <2 x i64> %3 } -define <2 x i64> @test_insertqi_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp { +define <2 x i64> @test_insertqi_ret(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @test_insertqi_ret ; CHECK-NEXT: ret <2 x i64> undef %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2) nounwind