Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -968,14 +968,49 @@
     break;
 
   case Intrinsic::x86_sse4a_extrq: {
-    // EXTRQ uses only the lowest 64-bits of the first 128-bit vector
-    // operands and the lowest 16-bits of the second.
     Value *Op0 = II->getArgOperand(0);
     Value *Op1 = II->getArgOperand(1);
     unsigned VWidth0 = Op0->getType()->getVectorNumElements();
     unsigned VWidth1 = Op1->getType()->getVectorNumElements();
     assert(VWidth0 == 2 && VWidth1 == 16 && "Unexpected operand sizes");
 
+    // See if we're dealing with constant values.
+    Constant *C0 = dyn_cast<Constant>(Op0);
+    Constant *C1 = dyn_cast<Constant>(Op1);
+    ConstantInt *CI0 =
+        C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0))
+           : nullptr;
+    ConstantInt *CILength =
+        C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0))
+           : nullptr;
+    ConstantInt *CIIndex =
+        C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1))
+           : nullptr;
+
+    // Constant Fold - if Op1 is constant - convert to extrqi.
+    if (CILength && CIIndex) {
+      unsigned Length = CILength->getValue().zextOrTrunc(6).getZExtValue();
+      unsigned Index = CIIndex->getValue().zextOrTrunc(6).getZExtValue();
+
+      Type *IntTy8 = Type::getInt8Ty(II->getContext());
+      Constant *CILength = ConstantInt::get(IntTy8, Length, false);
+      Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
+      Value *Args[] = {Op0, CILength, CIIndex};
+      Module *M = CI.getParent()->getParent()->getParent();
+      Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
+      return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
+    }
+
+    // Constant Fold - extraction from zero is always {zero, undef}.
+    if (CI0 && CI0->equalsInt(0)) {
+      Type *IntTy64 = Type::getInt64Ty(II->getContext());
+      Constant *Args[] = {ConstantInt::get(IntTy64, 0),
+                          UndefValue::get(IntTy64)};
+      return ReplaceInstUsesWith(CI, ConstantVector::get(Args));
+    }
+
+    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
+    // operands and the lowest 16-bits of the second.
     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
       II->setArgOperand(0, V);
       return II;
@@ -988,13 +1023,91 @@
   }
 
   case Intrinsic::x86_sse4a_extrqi: {
-    // EXTRQI uses only the lowest 64-bits of the first 128-bit vector
-    // operand.
-    Value *Op = II->getArgOperand(0);
-    unsigned VWidth = Op->getType()->getVectorNumElements();
-    assert(VWidth == 2 && "Unexpected operand size");
+    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
+    // bits of the lower 64-bits. The upper 64-bits are undefined.
+    Value *Op0 = II->getArgOperand(0);
+    unsigned VWidth = Op0->getType()->getVectorNumElements();
+    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
+           "Unexpected operand size");
+
+    auto LowConstantHighUndef = [&](uint64_t Val) {
+      Type *IntTy64 = Type::getInt64Ty(II->getContext());
+      Constant *Args[] = {ConstantInt::get(IntTy64, Val),
+                          UndefValue::get(IntTy64)};
+      return ReplaceInstUsesWith(CI, ConstantVector::get(Args));
+    };
+
+    // See if we're dealing with constant values.
+    Constant *C0 = dyn_cast<Constant>(Op0);
+    ConstantInt *CI0 =
+        C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0))
+           : nullptr;
+
+    if (ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+      if (ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+        unsigned Index = CIIndex->getZExtValue();
+
+        // From AMD documentation: "a value of zero in the field length is
+        // defined as length of 64".
+        unsigned Length =
+            CILength->equalsInt(0) ? 64 : CILength->getZExtValue();
+
+        // From AMD documentation: "If the sum of the bit index + length field
+        // is greater than 64, the results are undefined".
+        unsigned End = Index + Length;
+
+        // Note that both field index and field length are 8-bit quantities.
+        // Since variables 'Index' and 'Length' are unsigned values
+        // obtained from zero-extending field index and field length
+        // respectively, their sum should never wrap around.
+        if (End > 64)
+          return ReplaceInstUsesWith(CI, UndefValue::get(II->getType()));
+
+        // If we are inserting whole bytes, we can convert this to a shuffle.
+        // Lowering can recognize EXTRQI shuffle masks.
+        if ((Length % 8) == 0 && (Index % 8) == 0) {
+          // Convert bit indices to byte indices.
+          Length /= 8;
+          Index /= 8;
+
+          Type *IntTy8 = Type::getInt8Ty(II->getContext());
+          Type *IntTy32 = Type::getInt32Ty(II->getContext());
+          VectorType *ShufTy = VectorType::get(IntTy8, 16);
+          SmallVector<Constant *, 16> ShuffleMask;
+          for (int i = 0; i != Length; ++i)
+            ShuffleMask.push_back(
+                Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
+          for (int i = Length; i != 8; ++i)
+            ShuffleMask.push_back(
+                Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
+          for (int i = 8; i != 16; ++i)
+            ShuffleMask.push_back(UndefValue::get(IntTy32));
+
+          Value *SV =
+              Builder->CreateShuffleVector(Builder->CreateBitCast(Op0, ShufTy),
+                                           ConstantAggregateZero::get(ShufTy),
+                                           ConstantVector::get(ShuffleMask));
+          return ReplaceInstUsesWith(CI,
+                                     Builder->CreateBitCast(SV, II->getType()));
+        }
+
+        // Constant Fold - shift Index'th bit to lowest position and mask off
+        // Length bits.
+        if (CI0) {
+          APInt Elt = CI0->getValue();
+          Elt = Elt.lshr(Index).zextOrTrunc(Length);
+          return LowConstantHighUndef(Elt.getZExtValue());
+        }
+      }
+    }
 
-    if (Value *V = SimplifyDemandedVectorEltsLow(Op, VWidth, 1)) {
+    // Constant Fold - extraction from zero is always {zero, undef}.
+    if (CI0 && CI0->equalsInt(0))
+      return LowConstantHighUndef(0);
+
+    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
+    // operand.
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
       II->setArgOperand(0, V);
       return II;
@@ -1002,13 +1115,36 @@
   }
 
   case Intrinsic::x86_sse4a_insertq: {
-    // INSERTQ uses only the lowest 64-bits of the first 128-bit vector
-    // operand.
-    Value *Op = II->getArgOperand(0);
-    unsigned VWidth = Op->getType()->getVectorNumElements();
-    assert(VWidth == 2 && "Unexpected operand size");
+    Value *Op0 = II->getArgOperand(0);
+    Value *Op1 = II->getArgOperand(1);
+    unsigned VWidth = Op0->getType()->getVectorNumElements();
+    assert(Op1->getType()->getVectorNumElements() == 2 && VWidth == 2 &&
+           "Unexpected operand size");
+
+    // See if we're dealing with constant values.
+    Constant *C1 = dyn_cast<Constant>(Op1);
+    ConstantInt *CI11 =
+        C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1))
+           : nullptr;
+
+    // Constant Fold - if Op1 is constant - convert to insertqi.
+    if (CI11) {
+      APInt V11 = CI11->getValue();
+      unsigned Length = V11.zextOrTrunc(6).getZExtValue();
+      unsigned Index = V11.lshr(8).zextOrTrunc(6).getZExtValue();
+
+      Type *IntTy8 = Type::getInt8Ty(II->getContext());
+      Constant *CILength = ConstantInt::get(IntTy8, Length, false);
+      Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
+      Value *Args[] = {Op0, Op1, CILength, CIIndex};
+      Module *M = CI.getParent()->getParent()->getParent();
+      Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
+      return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
+    }
 
-    if (Value *V = SimplifyDemandedVectorEltsLow(Op, VWidth, 1)) {
+    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
+    // operand.
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
       II->setArgOperand(0, V);
       return II;
@@ -1016,15 +1152,23 @@
   }
 
   case Intrinsic::x86_sse4a_insertqi: {
-    // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
-    // ones undef
-    // TODO: eventually we should lower this intrinsic to IR
+    // INSERTQI: Extract lowest Length bits from lower half of second source and
+    // insert over first source starting at Index bit. The upper 64-bits are
+    // undefined.
+    Value *Op0 = II->getArgOperand(0);
+    Value *Op1 = II->getArgOperand(1);
+    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
+    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
+    assert(VWidth0 == 2 && VWidth1 == 2 && "Unexpected operand sizes");
+
     if (auto CILength = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
       if (auto CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3))) {
         unsigned Index = CIIndex->getZExtValue();
+
         // From AMD documentation: "a value of zero in the field length is
         // defined as length of 64".
-        unsigned Length = CILength->equalsInt(0) ? 64 : CILength->getZExtValue();
+        unsigned Length =
+            CILength->equalsInt(0) ? 64 : CILength->getZExtValue();
 
         // From AMD documentation: "If the sum of the bit index + length field
         // is greater than 64, the results are undefined".
@@ -1037,69 +1181,66 @@
         if (End > 64)
           return ReplaceInstUsesWith(CI, UndefValue::get(II->getType()));
 
-        if (Length == 64 && Index == 0) {
-          Value *Vec = II->getArgOperand(1);
-          Value *Undef = UndefValue::get(Vec->getType());
-          const uint32_t Mask[] = { 0, 2 };
-          return ReplaceInstUsesWith(
-              CI,
-              Builder->CreateShuffleVector(
-                  Vec, Undef, ConstantDataVector::get(
-                                  II->getContext(), makeArrayRef(Mask))));
-        } else if (auto Source =
-                       dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
-          if (Source->hasOneUse() &&
-              Source->getArgOperand(1) == II->getArgOperand(1)) {
-            // If the source of the insert has only one use and it's another
-            // insert (and they're both inserting from the same vector), try to
-            // bundle both together.
-            auto CISourceLength =
-                dyn_cast<ConstantInt>(Source->getArgOperand(2));
-            auto CISourceIndex =
-                dyn_cast<ConstantInt>(Source->getArgOperand(3));
-            if (CISourceIndex && CISourceLength) {
-              unsigned SourceIndex = CISourceIndex->getZExtValue();
-              unsigned SourceLength = CISourceLength->getZExtValue();
-              unsigned SourceEnd = SourceIndex + SourceLength;
-              unsigned NewIndex, NewLength;
-              bool ShouldReplace = false;
-              if (Index <= SourceIndex && SourceIndex <= End) {
-                NewIndex = Index;
-                NewLength = std::max(End, SourceEnd) - NewIndex;
-                ShouldReplace = true;
-              } else if (SourceIndex <= Index && Index <= SourceEnd) {
-                NewIndex = SourceIndex;
-                NewLength = std::max(SourceEnd, End) - NewIndex;
-                ShouldReplace = true;
-              }
-
-              if (ShouldReplace) {
-                Constant *ConstantLength = ConstantInt::get(
-                    II->getArgOperand(2)->getType(), NewLength, false);
-                Constant *ConstantIndex = ConstantInt::get(
-                    II->getArgOperand(3)->getType(), NewIndex, false);
-                Value *Args[4] = { Source->getArgOperand(0),
-                                   II->getArgOperand(1), ConstantLength,
-                                   ConstantIndex };
-                Module *M = CI.getParent()->getParent()->getParent();
-                Value *F =
-                    Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
-                return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
-              }
-            }
-          }
+        // If we are inserting whole bytes, we can convert this to a shuffle.
+        // Lowering can recognize INSERTQI shuffle masks.
+        if ((Length % 8) == 0 && (Index % 8) == 0) {
+          // Convert bit indices to byte indices.
+          Length /= 8;
+          Index /= 8;
+
+          Type *IntTy8 = Type::getInt8Ty(II->getContext());
+          Type *IntTy32 = Type::getInt32Ty(II->getContext());
+          VectorType *ShufTy = VectorType::get(IntTy8, 16);
+          SmallVector<Constant *, 16> ShuffleMask;
+          for (int i = 0; i != Index; ++i)
+            ShuffleMask.push_back(
+                Constant::getIntegerValue(IntTy32, APInt(32, i)));
+          for (int i = 0; i != Length; ++i)
+            ShuffleMask.push_back(
+                Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
+          for (int i = Index + Length; i != 8; ++i)
+            ShuffleMask.push_back(
+                Constant::getIntegerValue(IntTy32, APInt(32, i)));
+          for (int i = 8; i != 16; ++i)
+            ShuffleMask.push_back(UndefValue::get(IntTy32));
+
+          Value *SV =
+              Builder->CreateShuffleVector(Builder->CreateBitCast(Op0, ShufTy),
+                                           Builder->CreateBitCast(Op1, ShufTy),
+                                           ConstantVector::get(ShuffleMask));
+          return ReplaceInstUsesWith(CI,
+                                     Builder->CreateBitCast(SV, II->getType()));
+        }
+
+        // See if we're dealing with constant values.
+        Constant *C0 = dyn_cast<Constant>(Op0);
+        Constant *C1 = dyn_cast<Constant>(Op1);
+        ConstantInt *CI00 =
+            C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0))
+               : nullptr;
+        ConstantInt *CI10 =
+            C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0))
+               : nullptr;
+
+        // Constant Fold - insert bottom Length bits starting at the Index'th
+        // bit.
+        if (CI00 && CI10) {
+          APInt V00 = CI00->getValue();
+          APInt V10 = CI10->getValue();
+          APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
+          V00 = V00 & ~Mask;
+          V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
+          APInt Val = V00 | V10;
+          Type *IntTy64 = Type::getInt64Ty(II->getContext());
+          Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
+                              UndefValue::get(IntTy64)};
+          return ReplaceInstUsesWith(CI, ConstantVector::get(Args));
         }
       }
     }
 
-    // INSERTQI uses only the lowest 64-bits of the first two 128-bit vector
+    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
     // operands.
-    Value *Op0 = II->getArgOperand(0);
-    Value *Op1 = II->getArgOperand(1);
-    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
-    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
-    assert(VWidth0 == 2 && VWidth1 == 2 && "Unexpected operand sizes");
-
     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
       II->setArgOperand(0, V);
       return II;
@@ -1183,9 +1324,9 @@
         // control mask is set, then zero is written in the result byte.
         // The zero vector is in the right-hand side of the resulting
        // shufflevector.
-
+
        // The value of each index is the least significant 4 bits of the
-        // shuffle control byte.
+        // shuffle control byte.
         Indexes[I] = (Index < 0) ? NumElts : Index & 0xF;
       }
     } else if (!isa<ConstantAggregateZero>(V))
Index: test/Transforms/InstCombine/x86-sse4a.ll
===================================================================
--- test/Transforms/InstCombine/x86-sse4a.ll
+++ test/Transforms/InstCombine/x86-sse4a.ll
@@ -1,122 +1,186 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-; We should optimize these two redundant insertqi into one
-; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
-; CHECK-NOT: insertqi
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
-  ret <2 x i64> %2
+;
+; EXTRQ
+;
+
+define <2 x i64> @test_extrq_call(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_call
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind
+  ret <2 x i64> %1
 }
 
-; The result of this insert is the second arg, since the top 64 bits of
-; the result are undefined, and we copy the bottom 64 bits from the
-; second arg
-; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> %i
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
+define <2 x i64> @test_extrq_zero_arg0(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_zero_arg0
+; CHECK-NEXT: ret <2 x i64> <i64 0, i64 undef>
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> zeroinitializer, <16 x i8> %y) nounwind
   ret <2 x i64> %1
 }
 
-; Test the several types of ranges and ordering that exist for two insertqi
-; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrq_zero_arg1(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_zero_arg1
+; CHECK-NEXT: ret <2 x i64> %x
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> zeroinitializer) nounwind
+  ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrq_to_extqi(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_to_extqi
+; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 15)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> ) nounwind
+  ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
-  ret <2 x i64> %2
+;
+; EXTRQI
+;
+
+define <2 x i64> @test_extrqi_call(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_call
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 23)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 23)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrqi_shuffle_1zuu(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_shuffle_1zuu
+; CHECK-NEXT: %1 = bitcast <2 x i64> %x to <16 x i8>
+; CHECK-NEXT: %2 = shufflevector <16 x i8> %1, <16 x i8> , <16 x i32>
+; CHECK-NEXT: %3 = bitcast <16 x i8> %2 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %3
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 32, i8 32)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrqi_shuffle_2zzzzzzzuuuuuuuu(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_shuffle_2zzzzzzzuuuuuuuu
+; CHECK-NEXT: %1 = bitcast <2 x i64> %x to <16 x i8>
+; CHECK-NEXT: %2 = shufflevector <16 x i8> %1, <16 x i8> , <16 x i32>
+; CHECK-NEXT: %3 = bitcast <16 x i8> %2 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %3
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 16)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK: ret <2 x i64> %[[RES]]
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrqi_undef(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_undef
+; CHECK-NEXT: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> zeroinitializer, i8 32, i8 33)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
+define <2 x i64> @test_extrqi_zero(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_zero
+; CHECK-NEXT: ret <2 x i64> <i64 0, i64 undef>
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> zeroinitializer, i8 3, i8 18)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
-define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+define <2 x i64> @test_extrqi_constant(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_constant
+; CHECK-NEXT: ret <2 x i64>
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> , i8 3, i8 18)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrqi_constant_undef(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_constant_undef
+; CHECK-NEXT: ret <2 x i64>
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> , i8 3, i8 18)
+  ret <2 x i64> %1
+}
+
+;
+; INSERTQ
+;
+
+define <2 x i64> @test_insertq_call(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertq_call
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_insertq_to_insertqi(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertq_to_insertqi
+; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> , i8 18, i8 2)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> ) nounwind
+  ret <2 x i64> %1
+}
+
+;
+; INSERTQI
+;
+
+define <2 x i64> @test_insertqi_shuffle_04uu(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @test_insertqi_shuffle_04uu
+; CHECK-NEXT: %1 = bitcast <2 x i64> %i to <16 x i8>
+; CHECK-NEXT: %2 = bitcast <2 x i64> %v to <16 x i8>
+; CHECK-NEXT: %3 = shufflevector <16 x i8> %2, <16 x i8> %1, <16 x i32>
+; CHECK-NEXT: %4 = bitcast <16 x i8> %3 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %4
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
   ret <2 x i64> %1
 }
 
+define <2 x i64> @test_insertqi_shuffle_8123uuuu(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @test_insertqi_shuffle_8123uuuu
+; CHECK-NEXT: %1 = bitcast <2 x i64> %i to <16 x i8>
+; CHECK-NEXT: %2 = bitcast <2 x i64> %v to <16 x i8>
+; CHECK-NEXT: %3 = shufflevector <16 x i8> %2, <16 x i8> %1, <16 x i32>
+; CHECK-NEXT: %4 = bitcast <16 x i8> %3 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %4
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_insertqi_constant(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @test_insertqi_constant
+; CHECK-NEXT: ret <2 x i64>
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> , <2 x i64> , i8 16, i8 1)
+  ret <2 x i64> %1
+}
+
+; The result of this insert is the second arg, since the top 64 bits of
+; the result are undefined, and we copy the bottom 64 bits from the
+; second arg
+define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @testInsert64Bits
+; CHECK-NEXT: ret <2 x i64> %i
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
+  ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i)
 define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> %i
+; CHECK-LABEL: @testZeroLength
+; CHECK-NEXT: ret <2 x i64> %i
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i)
 define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> undef
+; CHECK-LABEL: @testUndefinedInsertq_1
+; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i)
 define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> undef
+; CHECK-LABEL: @testUndefinedInsertq_2
+; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
   ret <2 x i64> %1
 }
 
-; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i)
 define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
-; CHECK: ret <2 x i64> undef
+; CHECK-LABEL: @testUndefinedInsertq_3
+; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
   ret <2 x i64> %1
 }
@@ -125,7 +189,7 @@
 ;
 ; Vector Demanded Bits
 ;
-define <2 x i64> @test_extrq_arg0(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+define <2 x i64> @test_extrq_arg0(<2 x i64> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @test_extrq_arg0
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -134,7 +198,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_extrq_arg1(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+define <2 x i64> @test_extrq_arg1(<2 x i64> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @test_extrq_arg1
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -143,7 +207,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_extrq_args01(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+define <2 x i64> @test_extrq_args01(<2 x i64> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @test_extrq_args01
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -153,7 +217,7 @@
   ret <2 x i64> %3
 }
 
-define <2 x i64> @test_extrq_ret(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
+define <2 x i64> @test_extrq_ret(<2 x i64> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @test_extrq_ret
 ; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind
@@ -161,7 +225,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_extrqi_arg0(<2 x i64> %x) nounwind uwtable ssp {
+define <2 x i64> @test_extrqi_arg0(<2 x i64> %x) {
 ; CHECK-LABEL: @test_extrqi_arg0
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -170,7 +234,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_extrqi_ret(<2 x i64> %x) nounwind uwtable ssp {
+define <2 x i64> @test_extrqi_ret(<2 x i64> %x) {
 ; CHECK-LABEL: @test_extrqi_ret
 ; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2) nounwind
@@ -178,7 +242,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_insertq_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertq_arg0(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertq_arg0
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -187,7 +251,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_insertq_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertq_ret(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertq_ret
 ; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
@@ -195,7 +259,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_insertqi_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertqi_arg0(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertqi_arg0
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -204,7 +268,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_insertqi_arg1(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertqi_arg1(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertqi_arg1
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -213,7 +277,7 @@
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_insertqi_args01(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertqi_args01(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertqi_args01
 ; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
 ; CHECK-NEXT: ret <2 x i64> %1
@@ -223,7 +287,7 @@
   ret <2 x i64> %3
 }
 
-define <2 x i64> @test_insertqi_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+define <2 x i64> @test_insertqi_ret(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @test_insertqi_ret
 ; CHECK-NEXT: ret <2 x i64> undef
   %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2) nounwind