diff --git a/clang/test/CodeGen/thinlto-distributed-newpm.ll b/clang/test/CodeGen/thinlto-distributed-newpm.ll
--- a/clang/test/CodeGen/thinlto-distributed-newpm.ll
+++ b/clang/test/CodeGen/thinlto-distributed-newpm.ll
@@ -62,13 +62,13 @@
 ; CHECK-O: Running pass: InstCombinePass on main
 ; CHECK-O: Running analysis: TargetLibraryAnalysis on main
 ; CHECK-O: Running analysis: OptimizationRemarkEmitterAnalysis on main
+; CHECK-O: Running analysis: TargetIRAnalysis on main
 ; CHECK-O: Running analysis: AAManager on main
 ; CHECK-O: Running analysis: BasicAA on main
 ; CHECK-O: Running analysis: ScopedNoAliasAA on main
 ; CHECK-O: Running analysis: TypeBasedAA on main
 ; CHECK-O: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-O: Running pass: SimplifyCFGPass on main
-; CHECK-O: Running analysis: TargetIRAnalysis on main
 ; CHECK-O: Finished {{.*}}Function pass manager run.
 ; CHECK-O: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
 ; CHECK-O: Running analysis: GlobalsAA
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -42,6 +42,7 @@
 class ExtractElementInst;
 class Function;
 class GlobalValue;
+class InstCombiner;
 class IntrinsicInst;
 class LoadInst;
 class LoopAccessInfo;
@@ -56,6 +57,7 @@
 class Type;
 class User;
 class Value;
+struct KnownBits;
 template <typename T> class Optional;
 
 /// Information about a load/store intrinsic defined by the target.
@@ -542,6 +544,29 @@
   /// target-independent defaults with information from \p L and \p SE.
   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                              PeelingPreferences &PP) const;
+
+  /// Targets can implement their own combines for target-specific
+  /// intrinsics. This function will be called from the InstCombine pass every
+  /// time a target-specific intrinsic is encountered.
+  ///
+  /// \returns None to not do anything target specific, or a value that will be
+  /// returned from the InstCombiner. Returning nullptr stops any further
+  /// processing of the intrinsic.
+  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+                                               IntrinsicInst &II) const;
+  /// Can be used to implement target-specific instruction combining.
+  /// \see instCombineIntrinsic
+  Optional<Value *>
+  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
+                                   APInt DemandedMask, KnownBits &Known,
+                                   bool &KnownBitsComputed) const;
+  /// Can be used to implement target-specific instruction combining.
+ /// \see instCombineIntrinsic + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const; /// @} /// \name Scalar Target Information @@ -1301,6 +1326,17 @@ AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI) = 0; virtual bool emitGetActiveLaneMask() = 0; + virtual Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) = 0; + virtual Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) = 0; + virtual Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) = 0; virtual bool isLegalAddImmediate(int64_t Imm) = 0; virtual bool isLegalICmpImmediate(int64_t Imm) = 0; virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, @@ -1588,6 +1624,26 @@ bool emitGetActiveLaneMask() override { return Impl.emitGetActiveLaneMask(); } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) override { + return Impl.instCombineIntrinsic(IC, II); + } + Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) override { + return Impl.simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, + KnownBitsComputed); + } + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) override { + return Impl.simplifyDemandedVectorEltsIntrinsic( + IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + SimplifyAndSetOp); + } bool isLegalAddImmediate(int64_t Imm) override { return Impl.isLegalAddImmediate(Imm); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -147,6 +147,26 @@ return false; } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const { + return None; + } + + Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const { + return None; + } + + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const { + return None; + } + void getUnrollingPreferences(Loop *, ScalarEvolution &, TTI::UnrollingPreferences &) {} diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -477,6 +477,30 @@ return BaseT::emitGetActiveLaneMask(); } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) { + return BaseT::instCombineIntrinsic(IC, II); + } + + Optional simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, + IntrinsicInst &II, + APInt DemandedMask, + KnownBits &Known, + bool &KnownBitsComputed) { + return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, + KnownBitsComputed); + } + + Optional 
simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) { + return BaseT::simplifyDemandedVectorEltsIntrinsic( + IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + SimplifyAndSetOp); + } + int getInstructionLatency(const Instruction *I) { if (isa(I)) return getST()->getSchedModel().DefaultLoadLatency; @@ -1605,7 +1629,7 @@ } } - auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); + auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); if (MinLegalCostI != LegalCost.end()) return *MinLegalCostI; diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -199,6 +199,11 @@ /// returns Intrinsic::not_intrinsic! bool isIntrinsic() const { return HasLLVMReservedName; } + /// isTargetIntrinsic - Returns true if this function is an intrinsic and the + /// intrinsic is specific to a certain target. If this is not an intrinsic + /// or a generic intrinsic, false is returned. + bool isTargetIntrinsic() const; + /// Returns true if the function is one of the "Constrained Floating-Point /// Intrinsics". Returns false if not, and returns false when /// getIntrinsicID() returns Intrinsic::not_intrinsic. diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h @@ -0,0 +1,518 @@ +//===- InstCombiner.h - InstCombine implementation --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file provides the interface for the instcombine pass implementation. +/// The interface is used for generic transformations in this folder and +/// target specific combinations in the targets. +/// The visitor implementation is in \c InstCombinerImpl in +/// \c InstCombineInternal.h. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINER_H +#define LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINER_H + +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include + +#define DEBUG_TYPE "instcombine" + +namespace llvm { + +class AAResults; +class AssumptionCache; +class ProfileSummaryInfo; +class TargetLibraryInfo; +class TargetTransformInfo; +struct KnownBits; + +/// The core instruction combiner logic. +/// +/// This class provides both the logic to recursively visit instructions and +/// combine them. +class LLVM_LIBRARY_VISIBILITY InstCombiner { + /// Only used to call target specific inst combining. + TargetTransformInfo &TTI; + +public: + /// Maximum size of array considered when transforming. + uint64_t MaxArraySizeForCombine = 0; + + /// An IRBuilder that automatically inserts new instructions into the + /// worklist. 
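+  /// (an IRBuilder specialized with TargetFolder and an insertion callback
+  /// that pushes newly created instructions onto the worklist).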
+ using BuilderTy = IRBuilder; + BuilderTy &Builder; + +protected: + /// A worklist of the instructions that need to be simplified. + InstCombineWorklist &Worklist; + + // Mode in which we are running the combiner. + const bool MinimizeSize; + + AAResults *AA; + + // Required analyses. + AssumptionCache &AC; + TargetLibraryInfo &TLI; + DominatorTree &DT; + const DataLayout &DL; + const SimplifyQuery SQ; + OptimizationRemarkEmitter &ORE; + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; + + // Optional analyses. When non-null, these can both be used to do better + // combining and will be updated to reflect any changes. + LoopInfo *LI; + + bool MadeIRChange = false; + +public: + InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder, + bool MinimizeSize, AAResults *AA, AssumptionCache &AC, + TargetLibraryInfo &TLI, TargetTransformInfo &TTI, + DominatorTree &DT, OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + const DataLayout &DL, LoopInfo *LI) + : TTI(TTI), Builder(Builder), Worklist(Worklist), + MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), + SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} + + virtual ~InstCombiner() {} + + /// Return the source operand of a potentially bitcasted value while + /// optionally checking if it has one use. If there is no bitcast or the one + /// use check is not met, return the input value itself. + static Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) { + if (auto *BitCast = dyn_cast(V)) + if (!OneUseOnly || BitCast->hasOneUse()) + return BitCast->getOperand(0); + + // V is not a bitcast or V has more than one use and OneUseOnly is true. + return V; + } + + /// Assign a complexity or rank value to LLVM Values. This is used to reduce + /// the amount of pattern matching needed for compares and commutative + /// instructions. For example, if we have: + /// icmp ugt X, Constant + /// or + /// xor (add X, Constant), cast Z + /// + /// We do not have to consider the commuted variants of these patterns because + /// canonicalization based on complexity guarantees the above ordering. + /// + /// This routine maps IR values to various complexity ranks: + /// 0 -> undef + /// 1 -> Constants + /// 2 -> Other non-instructions + /// 3 -> Arguments + /// 4 -> Cast and (f)neg/not instructions + /// 5 -> Other instructions + static unsigned getComplexity(Value *V) { + if (isa(V)) { + if (isa(V) || match(V, m_Neg(PatternMatch::m_Value())) || + match(V, m_Not(PatternMatch::m_Value())) || + match(V, m_FNeg(PatternMatch::m_Value()))) + return 4; + return 5; + } + if (isa(V)) + return 3; + return isa(V) ? (isa(V) ? 0 : 1) : 2; + } + + /// Predicate canonicalization reduces the number of patterns that need to be + /// matched by other transforms. For example, we may swap the operands of a + /// conditional branch or select to create a compare with a canonical + /// (inverted) predicate which is then more likely to be matched with other + /// values. + static bool isCanonicalPredicate(CmpInst::Predicate Pred) { + switch (Pred) { + case CmpInst::ICMP_NE: + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGE: + // TODO: There are 16 FCMP predicates. Should others be (not) canonical? + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_OGE: + return false; + default: + return true; + } + } + + /// Given an exploded icmp instruction, return true if the comparison only + /// checks the sign bit. 
If it only checks the sign bit, set TrueIfSigned if + /// the result of the comparison is true when the input value is signed. + static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, + bool &TrueIfSigned) { + switch (Pred) { + case ICmpInst::ICMP_SLT: // True if LHS s< 0 + TrueIfSigned = true; + return RHS.isNullValue(); + case ICmpInst::ICMP_SLE: // True if LHS s<= -1 + TrueIfSigned = true; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGT: // True if LHS s> -1 + TrueIfSigned = false; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGE: // True if LHS s>= 0 + TrueIfSigned = false; + return RHS.isNullValue(); + case ICmpInst::ICMP_UGT: + // True if LHS u> RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = true; + return RHS.isMaxSignedValue(); + case ICmpInst::ICMP_UGE: + // True if LHS u>= RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = true; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULT: + // True if LHS u< RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = false; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULE: + // True if LHS u<= RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = false; + return RHS.isMaxSignedValue(); + default: + return false; + } + } + + /// Add one to a Constant + static Constant *AddOne(Constant *C) { + return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); + } + + /// Subtract one from a Constant + static Constant *SubOne(Constant *C) { + return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); + } + + llvm::Optional> static getFlippedStrictnessPredicateAndConstant(CmpInst:: + Predicate + Pred, + Constant *C); + + /// Return true if the specified value is free to invert (apply ~ to). + /// This happens in cases where the ~ can be eliminated. If WillInvertAllUses + /// is true, work under the assumption that the caller intends to remove all + /// uses of V and only keep uses of ~V. + /// + /// See also: canFreelyInvertAllUsersOf() + static bool isFreeToInvert(Value *V, bool WillInvertAllUses) { + // ~(~(X)) -> X. + if (match(V, m_Not(PatternMatch::m_Value()))) + return true; + + // Constants can be considered to be not'ed values. + if (match(V, PatternMatch::m_AnyIntegralConstant())) + return true; + + // Compares can be inverted if all of their uses are being modified to use + // the ~V. + if (isa(V)) + return WillInvertAllUses; + + // If `V` is of the form `A + Constant` then `-1 - V` can be folded into + // `(-1 - Constant) - A` if we are willing to invert all of the uses. + if (BinaryOperator *BO = dyn_cast(V)) + if (BO->getOpcode() == Instruction::Add || + BO->getOpcode() == Instruction::Sub) + if (isa(BO->getOperand(0)) || + isa(BO->getOperand(1))) + return WillInvertAllUses; + + // Selects with invertible operands are freely invertible + if (match(V, + m_Select(PatternMatch::m_Value(), m_Not(PatternMatch::m_Value()), + m_Not(PatternMatch::m_Value())))) + return WillInvertAllUses; + + return false; + } + + /// Given i1 V, can every user of V be freely adapted if V is changed to !V ? + /// InstCombine's canonicalizeICmpPredicate() must be kept in sync with this + /// fn. + /// + /// See also: isFreeToInvert() + static bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) { + // Look at every user of V. + for (Use &U : V->uses()) { + if (U.getUser() == IgnoredUser) + continue; // Don't consider this user. 
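+      // Every remaining user must be an instruction whose use of V we know how
+      // to adapt to !V: a select condition, a branch condition, or a 'not'.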
+ + auto *I = cast(U.getUser()); + switch (I->getOpcode()) { + case Instruction::Select: + if (U.getOperandNo() != 0) // Only if the value is used as select cond. + return false; + break; + case Instruction::Br: + assert(U.getOperandNo() == 0 && "Must be branching on that value."); + break; // Free to invert by swapping true/false values/destinations. + case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring + // it. + if (!match(I, m_Not(PatternMatch::m_Value()))) + return false; // Not a 'not'. + break; + default: + return false; // Don't know, likely not freely invertible. + } + // So far all users were free to invert... + } + return true; // Can freely invert all users! + } + + /// Some binary operators require special handling to avoid poison and + /// undefined behavior. If a constant vector has undef elements, replace those + /// undefs with identity constants if possible because those are always safe + /// to execute. If no identity constant exists, replace undef with some other + /// safe constant. + static Constant * + getSafeVectorConstantForBinop(BinaryOperator::BinaryOps Opcode, Constant *In, + bool IsRHSConstant) { + auto *InVTy = dyn_cast(In->getType()); + assert(InVTy && "Not expecting scalars here"); + + Type *EltTy = InVTy->getElementType(); + auto *SafeC = ConstantExpr::getBinOpIdentity(Opcode, EltTy, IsRHSConstant); + if (!SafeC) { + // TODO: Should this be available as a constant utility function? It is + // similar to getBinOpAbsorber(). + if (IsRHSConstant) { + switch (Opcode) { + case Instruction::SRem: // X % 1 = 0 + case Instruction::URem: // X %u 1 = 0 + SafeC = ConstantInt::get(EltTy, 1); + break; + case Instruction::FRem: // X % 1.0 (doesn't simplify, but it is safe) + SafeC = ConstantFP::get(EltTy, 1.0); + break; + default: + llvm_unreachable( + "Only rem opcodes have no identity constant for RHS"); + } + } else { + switch (Opcode) { + case Instruction::Shl: // 0 << X = 0 + case Instruction::LShr: // 0 >>u X = 0 + case Instruction::AShr: // 0 >> X = 0 + case Instruction::SDiv: // 0 / X = 0 + case Instruction::UDiv: // 0 /u X = 0 + case Instruction::SRem: // 0 % X = 0 + case Instruction::URem: // 0 %u X = 0 + case Instruction::Sub: // 0 - X (doesn't simplify, but it is safe) + case Instruction::FSub: // 0.0 - X (doesn't simplify, but it is safe) + case Instruction::FDiv: // 0.0 / X (doesn't simplify, but it is safe) + case Instruction::FRem: // 0.0 % X = 0 + SafeC = Constant::getNullValue(EltTy); + break; + default: + llvm_unreachable("Expected to find identity constant for opcode"); + } + } + } + assert(SafeC && "Must have safe constant for binop"); + unsigned NumElts = InVTy->getNumElements(); + SmallVector Out(NumElts); + for (unsigned i = 0; i != NumElts; ++i) { + Constant *C = In->getAggregateElement(i); + Out[i] = isa(C) ? SafeC : C; + } + return ConstantVector::get(Out); + } + + /// Create and insert the idiom we use to indicate a block is unreachable + /// without having to rewrite the CFG from within InstCombine. 
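+  /// The idiom is a store of i1 'true' through an undef pointer: immediate UB
+  /// that marks the block as unreachable without touching the CFG here.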
+ static void CreateNonTerminatorUnreachable(Instruction *InsertAt) { + auto &Ctx = InsertAt->getContext(); + new StoreInst(ConstantInt::getTrue(Ctx), + UndefValue::get(Type::getInt1PtrTy(Ctx)), InsertAt); + } + + void addToWorklist(Instruction *I) { Worklist.push(I); } + + AssumptionCache &getAssumptionCache() const { return AC; } + TargetLibraryInfo &getTargetLibraryInfo() const { return TLI; } + DominatorTree &getDominatorTree() const { return DT; } + const DataLayout &getDataLayout() const { return DL; } + const SimplifyQuery &getSimplifyQuery() const { return SQ; } + OptimizationRemarkEmitter &getOptimizationRemarkEmitter() const { + return ORE; + } + BlockFrequencyInfo *getBlockFrequencyInfo() const { return BFI; } + ProfileSummaryInfo *getProfileSummaryInfo() const { return PSI; } + LoopInfo *getLoopInfo() const { return LI; } + + // Call target specific combiners + Optional targetInstCombineIntrinsic(IntrinsicInst &II); + Optional + targetSimplifyDemandedUseBitsIntrinsic(IntrinsicInst &II, APInt DemandedMask, + KnownBits &Known, + bool &KnownBitsComputed); + Optional targetSimplifyDemandedVectorEltsIntrinsic( + IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp); + + /// Inserts an instruction \p New before instruction \p Old + /// + /// Also adds the new instruction to the worklist and returns \p New so that + /// it is suitable for use as the return from the visitation patterns. + Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) { + assert(New && !New->getParent() && + "New instruction already inserted into a basic block!"); + BasicBlock *BB = Old.getParent(); + BB->getInstList().insert(Old.getIterator(), New); // Insert inst + Worklist.push(New); + return New; + } + + /// Same as InsertNewInstBefore, but also sets the debug loc. + Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) { + New->setDebugLoc(Old.getDebugLoc()); + return InsertNewInstBefore(New, Old); + } + + /// A combiner-aware RAUW-like routine. + /// + /// This method is to be used when an instruction is found to be dead, + /// replaceable with another preexisting expression. Here we add all uses of + /// I to the worklist, replace all uses of I with the new value, then return + /// I, so that the inst combiner will know that I was modified. + Instruction *replaceInstUsesWith(Instruction &I, Value *V) { + // If there are no uses to replace, then we return nullptr to indicate that + // no changes were made to the program. + if (I.use_empty()) + return nullptr; + + Worklist.pushUsersToWorkList(I); // Add all modified instrs to worklist. + + // If we are replacing the instruction with itself, this must be in a + // segment of unreachable code, so just clobber the instruction. + if (&I == V) + V = UndefValue::get(I.getType()); + + LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n" + << " with " << *V << '\n'); + + I.replaceAllUsesWith(V); + return &I; + } + + /// Replace operand of instruction and add old operand to the worklist. + Instruction *replaceOperand(Instruction &I, unsigned OpNum, Value *V) { + Worklist.addValue(I.getOperand(OpNum)); + I.setOperand(OpNum, V); + return &I; + } + + /// Replace use and add the previously used value to the worklist. + void replaceUse(Use &U, Value *NewValue) { + Worklist.addValue(U); + U = NewValue; + } + + /// Combiner aware instruction erasure. 
+ /// + /// When dealing with an instruction that has side effects or produces a void + /// value, we can't rely on DCE to delete the instruction. Instead, visit + /// methods should return the value returned by this function. + virtual Instruction *eraseInstFromFunction(Instruction &I) = 0; + + void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, + const Instruction *CxtI) const { + llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT); + } + + KnownBits computeKnownBits(const Value *V, unsigned Depth, + const Instruction *CxtI) const { + return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT); + } + + bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false, + unsigned Depth = 0, + const Instruction *CxtI = nullptr) { + return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT); + } + + bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0, + const Instruction *CxtI = nullptr) const { + return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT); + } + + unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0, + const Instruction *CxtI = nullptr) const { + return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForUnsignedMul(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForSignedMul(const Value *LHS, const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedMul(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForUnsignedAdd(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForSignedAdd(const Value *LHS, const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForUnsignedSub(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForUnsignedSub(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedSub(LHS, RHS, DL, &AC, CxtI, &DT); + } + + virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, + const APInt &DemandedMask, KnownBits &Known, + unsigned Depth = 0) = 0; + virtual Value * + SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, + unsigned Depth = 0, + bool AllowMultipleUsers = false) = 0; +}; + +} // namespace llvm + +#undef DEBUG_TYPE + +#endif diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -322,6 +322,29 @@ return TTIImpl->emitGetActiveLaneMask(); } +Optional +TargetTransformInfo::instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const { + return TTIImpl->instCombineIntrinsic(IC, II); +} + +Optional TargetTransformInfo::simplifyDemandedUseBitsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const { + return TTIImpl->simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, + KnownBitsComputed); +} + +Optional TargetTransformInfo::simplifyDemandedVectorEltsIntrinsic( + 
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const { + return TTIImpl->simplifyDemandedVectorEltsIntrinsic( + IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + SimplifyAndSetOp); +} + void TargetTransformInfo::getUnrollingPreferences( Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const { return TTIImpl->getUnrollingPreferences(L, SE, UP); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -618,6 +618,10 @@ #include "llvm/IR/IntrinsicImpl.inc" #undef GET_INTRINSIC_TARGET_DATA +bool Function::isTargetIntrinsic() const { + return IntID > TargetInfos[0].Count; +} + /// Find the segment of \c IntrinsicNameTable for intrinsics with the same /// target as \c Name, or the generic table if \c Name is not target specific. /// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -0,0 +1,896 @@ +//===- AMDGPInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// This file implements a TargetTransformInfo analysis pass specific to the +// AMDGPU target machine. It uses the target's detailed information to provide +// more precise answers to certain TTI queries, while letting the target +// independent and default TTI implementations handle the rest. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetTransformInfo.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" + +using namespace llvm; + +#define DEBUG_TYPE "AMDGPUtti" + +namespace { + +struct AMDGPUImageDMaskIntrinsic { + unsigned Intr; +}; + +#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL +#include "InstCombineTables.inc" + +} // end anonymous namespace + +// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs. +// +// A single NaN input is folded to minnum, so we rely on that folding for +// handling NaNs. +static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, + const APFloat &Src2) { + APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2); + + APFloat::cmpResult Cmp0 = Max3.compare(Src0); + assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately"); + if (Cmp0 == APFloat::cmpEqual) + return maxnum(Src1, Src2); + + APFloat::cmpResult Cmp1 = Max3.compare(Src1); + assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately"); + if (Cmp1 == APFloat::cmpEqual) + return maxnum(Src0, Src2); + + return maxnum(Src0, Src1); +} + +Optional +GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::amdgcn_rcp: { + Value *Src = II.getArgOperand(0); + + // TODO: Move to ConstantFolding/InstSimplify? 
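+    // rcp(undef) is folded to a quiet NaN.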
+ if (isa(Src)) { + Type *Ty = II.getType(); + auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); + return IC.replaceInstUsesWith(II, QNaN); + } + + if (II.isStrictFP()) + break; + + if (const ConstantFP *C = dyn_cast(Src)) { + const APFloat &ArgVal = C->getValueAPF(); + APFloat Val(ArgVal.getSemantics(), 1); + Val.divide(ArgVal, APFloat::rmNearestTiesToEven); + + // This is more precise than the instruction may give. + // + // TODO: The instruction always flushes denormal results (except for f16), + // should this also? + return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val)); + } + + break; + } + case Intrinsic::amdgcn_rsq: { + Value *Src = II.getArgOperand(0); + + // TODO: Move to ConstantFolding/InstSimplify? + if (isa(Src)) { + Type *Ty = II.getType(); + auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); + return IC.replaceInstUsesWith(II, QNaN); + } + + break; + } + case Intrinsic::amdgcn_frexp_mant: + case Intrinsic::amdgcn_frexp_exp: { + Value *Src = II.getArgOperand(0); + if (const ConstantFP *C = dyn_cast(Src)) { + int Exp; + APFloat Significand = + frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven); + + if (IID == Intrinsic::amdgcn_frexp_mant) { + return IC.replaceInstUsesWith( + II, ConstantFP::get(II.getContext(), Significand)); + } + + // Match instruction special case behavior. + if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) + Exp = 0; + + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp)); + } + + if (isa(Src)) { + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + } + + break; + } + case Intrinsic::amdgcn_class: { + enum { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity + }; + + const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | + N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | + P_NORMAL | P_INFINITY; + + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + const ConstantInt *CMask = dyn_cast(Src1); + if (!CMask) { + if (isa(Src0)) { + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + } + + if (isa(Src1)) { + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), false)); + } + break; + } + + uint32_t Mask = CMask->getZExtValue(); + + // If all tests are made, it doesn't matter what the value is. + if ((Mask & FullMask) == FullMask) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true)); + } + + if ((Mask & FullMask) == 0) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false)); + } + + if (Mask == (S_NAN | Q_NAN)) { + // Equivalent of isnan. Replace with standard fcmp. + Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0); + FCmp->takeName(&II); + return IC.replaceInstUsesWith(II, FCmp); + } + + if (Mask == (N_ZERO | P_ZERO)) { + // Equivalent of == 0. 
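+      // (an ordered compare against +0.0 also matches -0.0)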
+ Value *FCmp = + IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0)); + + FCmp->takeName(&II); + return IC.replaceInstUsesWith(II, FCmp); + } + + // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other + if (((Mask & S_NAN) || (Mask & Q_NAN)) && + isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) { + return IC.replaceOperand( + II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN))); + } + + const ConstantFP *CVal = dyn_cast(Src0); + if (!CVal) { + if (isa(Src0)) { + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + } + + // Clamp mask to used bits + if ((Mask & FullMask) != Mask) { + CallInst *NewCall = IC.Builder.CreateCall( + II.getCalledFunction(), + {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)}); + + NewCall->takeName(&II); + return IC.replaceInstUsesWith(II, NewCall); + } + + break; + } + + const APFloat &Val = CVal->getValueAPF(); + + bool Result = + ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || + ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || + ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || + ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || + ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || + ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || + ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || + ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || + ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || + ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); + + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result)); + } + case Intrinsic::amdgcn_cvt_pkrtz: { + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + if (const ConstantFP *C0 = dyn_cast(Src0)) { + if (const ConstantFP *C1 = dyn_cast(Src1)) { + const fltSemantics &HalfSem = + II.getType()->getScalarType()->getFltSemantics(); + bool LosesInfo; + APFloat Val0 = C0->getValueAPF(); + APFloat Val1 = C1->getValueAPF(); + Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); + Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); + + Constant *Folded = + ConstantVector::get({ConstantFP::get(II.getContext(), Val0), + ConstantFP::get(II.getContext(), Val1)}); + return IC.replaceInstUsesWith(II, Folded); + } + } + + if (isa(Src0) && isa(Src1)) { + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + } + + break; + } + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + + if (isa(Src0) && isa(Src1)) { + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + } + + break; + } + case Intrinsic::amdgcn_ubfe: + case Intrinsic::amdgcn_sbfe: { + // Decompose simple cases into standard shifts. + Value *Src = II.getArgOperand(0); + if (isa(Src)) { + return IC.replaceInstUsesWith(II, Src); + } + + unsigned Width; + Type *Ty = II.getType(); + unsigned IntSize = Ty->getIntegerBitWidth(); + + ConstantInt *CWidth = dyn_cast(II.getArgOperand(2)); + if (CWidth) { + Width = CWidth->getZExtValue(); + if ((Width & (IntSize - 1)) == 0) { + return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty)); + } + + // Hardware ignores high bits, so remove those. 
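+      // (only the width modulo the integer bit width is meaningful)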
+ if (Width >= IntSize) { + return IC.replaceOperand( + II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1))); + } + } + + unsigned Offset; + ConstantInt *COffset = dyn_cast(II.getArgOperand(1)); + if (COffset) { + Offset = COffset->getZExtValue(); + if (Offset >= IntSize) { + return IC.replaceOperand( + II, 1, + ConstantInt::get(COffset->getType(), Offset & (IntSize - 1))); + } + } + + bool Signed = IID == Intrinsic::amdgcn_sbfe; + + if (!CWidth || !COffset) + break; + + // The case of Width == 0 is handled above, which makes this tranformation + // safe. If Width == 0, then the ashr and lshr instructions become poison + // value since the shift amount would be equal to the bit size. + assert(Width != 0); + + // TODO: This allows folding to undef when the hardware has specific + // behavior? + if (Offset + Width < IntSize) { + Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width); + Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width) + : IC.Builder.CreateLShr(Shl, IntSize - Width); + RightShift->takeName(&II); + return IC.replaceInstUsesWith(II, RightShift); + } + + Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset) + : IC.Builder.CreateLShr(Src, Offset); + + RightShift->takeName(&II); + return IC.replaceInstUsesWith(II, RightShift); + } + case Intrinsic::amdgcn_exp: + case Intrinsic::amdgcn_exp_compr: { + ConstantInt *En = cast(II.getArgOperand(1)); + unsigned EnBits = En->getZExtValue(); + if (EnBits == 0xf) + break; // All inputs enabled. + + bool IsCompr = IID == Intrinsic::amdgcn_exp_compr; + bool Changed = false; + for (int I = 0; I < (IsCompr ? 2 : 4); ++I) { + if ((!IsCompr && (EnBits & (1 << I)) == 0) || + (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) { + Value *Src = II.getArgOperand(I + 2); + if (!isa(Src)) { + IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType())); + Changed = true; + } + } + } + + if (Changed) { + return &II; + } + + break; + } + case Intrinsic::amdgcn_fmed3: { + // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled + // for the shader. + + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + Value *Src2 = II.getArgOperand(2); + + // Checking for NaN before canonicalization provides better fidelity when + // mapping other operations onto fmed3 since the order of operands is + // unchanged. + CallInst *NewCall = nullptr; + if (match(Src0, PatternMatch::m_NaN()) || isa(Src0)) { + NewCall = IC.Builder.CreateMinNum(Src1, Src2); + } else if (match(Src1, PatternMatch::m_NaN()) || isa(Src1)) { + NewCall = IC.Builder.CreateMinNum(Src0, Src2); + } else if (match(Src2, PatternMatch::m_NaN()) || isa(Src2)) { + NewCall = IC.Builder.CreateMaxNum(Src0, Src1); + } + + if (NewCall) { + NewCall->copyFastMathFlags(&II); + NewCall->takeName(&II); + return IC.replaceInstUsesWith(II, NewCall); + } + + bool Swap = false; + // Canonicalize constants to RHS operands. 
+ // + // fmed3(c0, x, c1) -> fmed3(x, c0, c1) + if (isa(Src0) && !isa(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (isa(Src1) && !isa(Src2)) { + std::swap(Src1, Src2); + Swap = true; + } + + if (isa(Src0) && !isa(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (Swap) { + II.setArgOperand(0, Src0); + II.setArgOperand(1, Src1); + II.setArgOperand(2, Src2); + return &II; + } + + if (const ConstantFP *C0 = dyn_cast(Src0)) { + if (const ConstantFP *C1 = dyn_cast(Src1)) { + if (const ConstantFP *C2 = dyn_cast(Src2)) { + APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), + C2->getValueAPF()); + return IC.replaceInstUsesWith( + II, ConstantFP::get(IC.Builder.getContext(), Result)); + } + } + } + + break; + } + case Intrinsic::amdgcn_icmp: + case Intrinsic::amdgcn_fcmp: { + const ConstantInt *CC = cast(II.getArgOperand(2)); + // Guard against invalid arguments. + int64_t CCVal = CC->getZExtValue(); + bool IsInteger = IID == Intrinsic::amdgcn_icmp; + if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || + CCVal > CmpInst::LAST_ICMP_PREDICATE)) || + (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || + CCVal > CmpInst::LAST_FCMP_PREDICATE))) + break; + + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + + if (auto *CSrc0 = dyn_cast(Src0)) { + if (auto *CSrc1 = dyn_cast(Src1)) { + Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); + if (CCmp->isNullValue()) { + return IC.replaceInstUsesWith( + II, ConstantExpr::getSExt(CCmp, II.getType())); + } + + // The result of V_ICMP/V_FCMP assembly instructions (which this + // intrinsic exposes) is one bit per thread, masked with the EXEC + // register (which contains the bitmask of live threads). So a + // comparison that always returns true is the same as a read of the + // EXEC register. + Function *NewF = Intrinsic::getDeclaration( + II.getModule(), Intrinsic::read_register, II.getType()); + Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")}; + MDNode *MD = MDNode::get(II.getContext(), MDArgs); + Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)}; + CallInst *NewCall = IC.Builder.CreateCall(NewF, Args); + NewCall->addAttribute(AttributeList::FunctionIndex, + Attribute::Convergent); + NewCall->takeName(&II); + return IC.replaceInstUsesWith(II, NewCall); + } + + // Canonicalize constants to RHS. 
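+      // Swapping the operands also requires swapping the predicate.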
+ CmpInst::Predicate SwapPred = + CmpInst::getSwappedPredicate(static_cast(CCVal)); + II.setArgOperand(0, Src1); + II.setArgOperand(1, Src0); + II.setArgOperand( + 2, ConstantInt::get(CC->getType(), static_cast(SwapPred))); + return &II; + } + + if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) + break; + + // Canonicalize compare eq with true value to compare != 0 + // llvm.amdgcn.icmp(zext (i1 x), 1, eq) + // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) + // llvm.amdgcn.icmp(sext (i1 x), -1, eq) + // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) + Value *ExtSrc; + if (CCVal == CmpInst::ICMP_EQ && + ((match(Src1, PatternMatch::m_One()) && + match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) || + (match(Src1, PatternMatch::m_AllOnes()) && + match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) && + ExtSrc->getType()->isIntegerTy(1)) { + IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType())); + IC.replaceOperand(II, 2, + ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); + return &II; + } + + CmpInst::Predicate SrcPred; + Value *SrcLHS; + Value *SrcRHS; + + // Fold compare eq/ne with 0 from a compare result as the predicate to the + // intrinsic. The typical use is a wave vote function in the library, which + // will be fed from a user code condition compared with 0. Fold in the + // redundant compare. + + // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) + // -> llvm.amdgcn.[if]cmp(a, b, pred) + // + // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) + // -> llvm.amdgcn.[if]cmp(a, b, inv pred) + if (match(Src1, PatternMatch::m_Zero()) && + match(Src0, PatternMatch::m_ZExtOrSExt( + m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS), + PatternMatch::m_Value(SrcRHS))))) { + if (CCVal == CmpInst::ICMP_EQ) + SrcPred = CmpInst::getInversePredicate(SrcPred); + + Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) + ? Intrinsic::amdgcn_fcmp + : Intrinsic::amdgcn_icmp; + + Type *Ty = SrcLHS->getType(); + if (auto *CmpType = dyn_cast(Ty)) { + // Promote to next legal integer type. + unsigned Width = CmpType->getBitWidth(); + unsigned NewWidth = Width; + + // Don't do anything for i1 comparisons. + if (Width == 1) + break; + + if (Width <= 16) + NewWidth = 16; + else if (Width <= 32) + NewWidth = 32; + else if (Width <= 64) + NewWidth = 64; + else if (Width > 64) + break; // Can't handle this. + + if (Width != NewWidth) { + IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth); + if (CmpInst::isSigned(SrcPred)) { + SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy); + SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy); + } else { + SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy); + SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy); + } + } + } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) + break; + + Function *NewF = Intrinsic::getDeclaration( + II.getModule(), NewIID, {II.getType(), SrcLHS->getType()}); + Value *Args[] = {SrcLHS, SrcRHS, + ConstantInt::get(CC->getType(), SrcPred)}; + CallInst *NewCall = IC.Builder.CreateCall(NewF, Args); + NewCall->takeName(&II); + return IC.replaceInstUsesWith(II, NewCall); + } + + break; + } + case Intrinsic::amdgcn_ballot: { + if (auto *Src = dyn_cast(II.getArgOperand(0))) { + if (Src->isZero()) { + // amdgcn.ballot(i1 0) is zero. + return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType())); + } + + if (Src->isOne()) { + // amdgcn.ballot(i1 1) is exec. 
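+        // An i32 (wave32) ballot reads exec_lo; an i64 (wave64) ballot reads exec.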
+ const char *RegName = "exec"; + if (II.getType()->isIntegerTy(32)) + RegName = "exec_lo"; + else if (!II.getType()->isIntegerTy(64)) + break; + + Function *NewF = Intrinsic::getDeclaration( + II.getModule(), Intrinsic::read_register, II.getType()); + Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)}; + MDNode *MD = MDNode::get(II.getContext(), MDArgs); + Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)}; + CallInst *NewCall = IC.Builder.CreateCall(NewF, Args); + NewCall->addAttribute(AttributeList::FunctionIndex, + Attribute::Convergent); + NewCall->takeName(&II); + return IC.replaceInstUsesWith(II, NewCall); + } + } + break; + } + case Intrinsic::amdgcn_wqm_vote: { + // wqm_vote is identity when the argument is constant. + if (!isa(II.getArgOperand(0))) + break; + + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + case Intrinsic::amdgcn_kill: { + const ConstantInt *C = dyn_cast(II.getArgOperand(0)); + if (!C || !C->getZExtValue()) + break; + + // amdgcn.kill(i1 1) is a no-op + return IC.eraseInstFromFunction(II); + } + case Intrinsic::amdgcn_update_dpp: { + Value *Old = II.getArgOperand(0); + + auto *BC = cast(II.getArgOperand(5)); + auto *RM = cast(II.getArgOperand(3)); + auto *BM = cast(II.getArgOperand(4)); + if (BC->isZeroValue() || RM->getZExtValue() != 0xF || + BM->getZExtValue() != 0xF || isa(Old)) + break; + + // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value. + return IC.replaceOperand(II, 0, UndefValue::get(Old->getType())); + } + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { + // Discard vdst_in if it's not going to be read. + Value *VDstIn = II.getArgOperand(0); + if (isa(VDstIn)) + break; + + ConstantInt *FetchInvalid = cast(II.getArgOperand(4)); + ConstantInt *BoundCtrl = cast(II.getArgOperand(5)); + if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue()) + break; + + return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType())); + } + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: { + // A constant value is trivially uniform. + if (Constant *C = dyn_cast(II.getArgOperand(0))) { + return IC.replaceInstUsesWith(II, C); + } + + // The rest of these may not be safe if the exec may not be the same between + // the def and use. + Value *Src = II.getArgOperand(0); + Instruction *SrcInst = dyn_cast(Src); + if (SrcInst && SrcInst->getParent() != II.getParent()) + break; + + // readfirstlane (readfirstlane x) -> readfirstlane x + // readlane (readfirstlane x), y -> readfirstlane x + if (match(Src, + PatternMatch::m_Intrinsic())) { + return IC.replaceInstUsesWith(II, Src); + } + + if (IID == Intrinsic::amdgcn_readfirstlane) { + // readfirstlane (readlane x, y) -> readlane x, y + if (match(Src, PatternMatch::m_Intrinsic())) { + return IC.replaceInstUsesWith(II, Src); + } + } else { + // readlane (readlane x, y), y -> readlane x, y + if (match(Src, PatternMatch::m_Intrinsic( + PatternMatch::m_Value(), + PatternMatch::m_Specific(II.getArgOperand(1))))) { + return IC.replaceInstUsesWith(II, Src); + } + } + + break; + } + case Intrinsic::amdgcn_ldexp: { + // FIXME: This doesn't introduce new instructions and belongs in + // InstructionSimplify. + Type *Ty = II.getType(); + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + + // Folding undef to qnan is safe regardless of the FP mode. 
+ if (isa(Op0)) { + auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); + return IC.replaceInstUsesWith(II, QNaN); + } + + const APFloat *C = nullptr; + match(Op0, PatternMatch::m_APFloat(C)); + + // FIXME: Should flush denorms depending on FP mode, but that's ignored + // everywhere else. + // + // These cases should be safe, even with strictfp. + // ldexp(0.0, x) -> 0.0 + // ldexp(-0.0, x) -> -0.0 + // ldexp(inf, x) -> inf + // ldexp(-inf, x) -> -inf + if (C && (C->isZero() || C->isInfinity())) { + return IC.replaceInstUsesWith(II, Op0); + } + + // With strictfp, be more careful about possibly needing to flush denormals + // or not, and snan behavior depends on ieee_mode. + if (II.isStrictFP()) + break; + + if (C && C->isNaN()) { + // FIXME: We just need to make the nan quiet here, but that's unavailable + // on APFloat, only IEEEfloat + auto *Quieted = + ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven)); + return IC.replaceInstUsesWith(II, Quieted); + } + + // ldexp(x, 0) -> x + // ldexp(x, undef) -> x + if (isa(Op1) || match(Op1, PatternMatch::m_ZeroInt())) { + return IC.replaceInstUsesWith(II, Op0); + } + + break; + } + } + return None; +} + +/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics. +/// +/// Note: This only supports non-TFE/LWE image intrinsic calls; those have +/// struct returns. +Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, + IntrinsicInst &II, + APInt DemandedElts, + int DMaskIdx = -1) { + + // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully + // supported. + if (DMaskIdx < 0 && II.getType()->getScalarSizeInBits() != 32 && + DemandedElts.getActiveBits() == 3) + return nullptr; + + auto *IIVTy = cast(II.getType()); + unsigned VWidth = IIVTy->getNumElements(); + if (VWidth == 1) + return nullptr; + + IRBuilderBase::InsertPointGuard Guard(IC.Builder); + IC.Builder.SetInsertPoint(&II); + + // Assume the arguments are unchanged and later override them, if needed. + SmallVector Args(II.arg_begin(), II.arg_end()); + + if (DMaskIdx < 0) { + // Buffer case. + + const unsigned ActiveBits = DemandedElts.getActiveBits(); + const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros(); + + // Start assuming the prefix of elements is demanded, but possibly clear + // some other bits if there are trailing zeros (unused components at front) + // and update offset. + DemandedElts = (1 << ActiveBits) - 1; + + if (UnusedComponentsAtFront > 0) { + static const unsigned InvalidOffsetIdx = 0xf; + + unsigned OffsetIdx; + switch (II.getIntrinsicID()) { + case Intrinsic::amdgcn_raw_buffer_load: + OffsetIdx = 1; + break; + case Intrinsic::amdgcn_s_buffer_load: + // If resulting type is vec3, there is no point in trimming the + // load with updated offset, as the vec3 would most likely be widened to + // vec4 anyway during lowering. + if (ActiveBits == 4 && UnusedComponentsAtFront == 1) + OffsetIdx = InvalidOffsetIdx; + else + OffsetIdx = 1; + break; + case Intrinsic::amdgcn_struct_buffer_load: + OffsetIdx = 2; + break; + default: + // TODO: handle tbuffer* intrinsics. + OffsetIdx = InvalidOffsetIdx; + break; + } + + if (OffsetIdx != InvalidOffsetIdx) { + // Clear demanded bits and update the offset. 
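+        // The skipped leading components are folded into the buffer offset
+        // instead of being loaded and then discarded.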
+ DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1); + auto *Offset = II.getArgOperand(OffsetIdx); + unsigned SingleComponentSizeInBits = + IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType()); + unsigned OffsetAdd = + UnusedComponentsAtFront * SingleComponentSizeInBits / 8; + auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd); + Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal); + } + } + } else { + // Image case. + + ConstantInt *DMask = cast(II.getArgOperand(DMaskIdx)); + unsigned DMaskVal = DMask->getZExtValue() & 0xf; + + // Mask off values that are undefined because the dmask doesn't cover them + DemandedElts &= (1 << countPopulation(DMaskVal)) - 1; + + unsigned NewDMaskVal = 0; + unsigned OrigLoadIdx = 0; + for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) { + const unsigned Bit = 1 << SrcIdx; + if (!!(DMaskVal & Bit)) { + if (!!DemandedElts[OrigLoadIdx]) + NewDMaskVal |= Bit; + OrigLoadIdx++; + } + } + + if (DMaskVal != NewDMaskVal) + Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal); + } + + unsigned NewNumElts = DemandedElts.countPopulation(); + if (!NewNumElts) + return UndefValue::get(II.getType()); + + if (NewNumElts >= VWidth && DemandedElts.isMask()) { + if (DMaskIdx >= 0) + II.setArgOperand(DMaskIdx, Args[DMaskIdx]); + return nullptr; + } + + // Validate function argument and return types, extracting overloaded types + // along the way. + SmallVector OverloadTys; + if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys)) + return nullptr; + + Module *M = II.getParent()->getParent()->getParent(); + Type *EltTy = IIVTy->getElementType(); + Type *NewTy = + (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts); + + OverloadTys[0] = NewTy; + Function *NewIntrin = + Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys); + + CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args); + NewCall->takeName(&II); + NewCall->copyMetadata(II); + + if (NewNumElts == 1) { + return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()), + NewCall, + DemandedElts.countTrailingZeros()); + } + + SmallVector EltMask; + unsigned NewLoadIdx = 0; + for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) { + if (!!DemandedElts[OrigLoadIdx]) + EltMask.push_back(NewLoadIdx++); + else + EltMask.push_back(NewNumElts); + } + + Value *Shuffle = + IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask); + + return Shuffle; +} + +Optional GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const { + switch (II.getIntrinsicID()) { + case Intrinsic::amdgcn_buffer_load: + case Intrinsic::amdgcn_buffer_load_format: + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_s_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load_format: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_tbuffer_load: + return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts); + default: { + if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) { + return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0); + } + break; + } + } + return None; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h 
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -33,6 +33,7 @@ namespace llvm { class AMDGPUTargetLowering; +class InstCombiner; class Loop; class ScalarEvolution; class Type; @@ -223,6 +224,14 @@ Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const; + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const; + unsigned getVectorSplitCost() { return 0; } unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -34,6 +34,10 @@ add_public_tablegen_target(AMDGPUCommonTableGen) +set(LLVM_TARGET_DEFINITIONS InstCombineTables.td) +tablegen(LLVM InstCombineTables.inc -gen-searchable-tables) +add_public_tablegen_target(InstCombineTableGen) + add_llvm_target(AMDGPUCodeGen AMDGPUAliasAnalysis.cpp AMDGPUAlwaysInlinePass.cpp @@ -48,6 +52,7 @@ AMDGPUFixFunctionBitcasts.cpp AMDGPUFrameLowering.cpp AMDGPUHSAMetadataStreamer.cpp + AMDGPUInstCombineIntrinsic.cpp AMDGPUInstrInfo.cpp AMDGPUInstructionSelector.cpp AMDGPUISelDAGToDAG.cpp diff --git a/llvm/lib/Transforms/InstCombine/InstCombineTables.td b/llvm/lib/Target/AMDGPU/InstCombineTables.td rename from llvm/lib/Transforms/InstCombine/InstCombineTables.td rename to llvm/lib/Target/AMDGPU/InstCombineTables.td diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -113,6 +113,9 @@ return !ST->isTargetDarwin() && !ST->hasMVEFloatOps(); } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -28,6 +28,8 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include #include @@ -50,6 +52,28 @@ extern cl::opt EnableMaskedGatherScatters; +/// Convert a vector load intrinsic into a simple llvm load instruction. +/// This is beneficial when the underlying object being addressed comes +/// from a constant, since we get constant-folding for free. +static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, + InstCombiner::BuilderTy &Builder) { + auto *IntrAlign = dyn_cast(II.getArgOperand(1)); + + if (!IntrAlign) + return nullptr; + + unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign + ? 
MemAlign + : IntrAlign->getLimitedValue(); + + if (!isPowerOf2_32(Alignment)) + return nullptr; + + auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), + PointerType::get(II.getType(), 0)); + return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); +} + bool ARMTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -82,6 +106,114 @@ return false; } +Optional +ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::arm_neon_vld1: { + Align MemAlign = + getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()); + if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + } + + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + Align MemAlign = + getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()); + unsigned AlignArg = II.getNumArgOperands() - 1; + Value *AlignArgOp = II.getArgOperand(AlignArg); + MaybeAlign Align = cast(AlignArgOp)->getMaybeAlignValue(); + if (Align && *Align < MemAlign) { + return IC.replaceOperand( + II, AlignArg, + ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(), + false)); + } + break; + } + + case Intrinsic::arm_mve_pred_i2v: { + Value *Arg = II.getArgOperand(0); + Value *ArgArg; + if (match(Arg, PatternMatch::m_Intrinsic( + PatternMatch::m_Value(ArgArg))) && + II.getType() == ArgArg->getType()) { + return IC.replaceInstUsesWith(II, ArgArg); + } + Constant *XorMask; + if (match(Arg, m_Xor(PatternMatch::m_Intrinsic( + PatternMatch::m_Value(ArgArg)), + PatternMatch::m_Constant(XorMask))) && + II.getType() == ArgArg->getType()) { + if (auto *CI = dyn_cast(XorMask)) { + if (CI->getValue().trunc(16).isAllOnesValue()) { + auto TrueVector = IC.Builder.CreateVectorSplat( + cast(II.getType())->getNumElements(), + IC.Builder.getTrue()); + return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); + } + } + } + KnownBits ScalarKnown(32); + if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16), + ScalarKnown, 0)) { + return &II; + } + break; + } + case Intrinsic::arm_mve_pred_v2i: { + Value *Arg = II.getArgOperand(0); + Value *ArgArg; + if (match(Arg, PatternMatch::m_Intrinsic( + PatternMatch::m_Value(ArgArg)))) { + return IC.replaceInstUsesWith(II, ArgArg); + } + if (!II.getMetadata(LLVMContext::MD_range)) { + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + Metadata *M[] = { + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))}; + II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); + return &II; + } + break; + } + case Intrinsic::arm_mve_vadc: + case Intrinsic::arm_mve_vadc_predicated: { + unsigned CarryOp = + (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 
3 : 2; + assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && + "Bad type for intrinsic!"); + + KnownBits CarryKnown(32); + if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29), + CarryKnown)) { + return &II; + } + break; + } + } + return None; +} + int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -48,6 +48,9 @@ return AddressSpace::ADDRESS_SPACE_GENERIC; } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + // Loads and stores can be vectorized if the alignment is at least as big as // the load/store we want to vectorize. bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -111,6 +111,263 @@ return false; } +// Convert NVVM intrinsics to target-generic LLVM code where possible. +static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { + // Each NVVM intrinsic we can simplify can be replaced with one of: + // + // * an LLVM intrinsic, + // * an LLVM cast operation, + // * an LLVM binary operation, or + // * ad-hoc LLVM IR for the particular operation. + + // Some transformations are only valid when the module's + // flush-denormals-to-zero (ftz) setting is true/false, whereas other + // transformations are valid regardless of the module's ftz setting. + enum FtzRequirementTy { + FTZ_Any, // Any ftz setting is ok. + FTZ_MustBeOn, // Transformation is valid only if ftz is on. + FTZ_MustBeOff, // Transformation is valid only if ftz is off. + }; + // Classes of NVVM intrinsics that can't be replaced one-to-one with a + // target-generic intrinsic, cast op, or binary op but that we can nonetheless + // simplify. + enum SpecialCase { + SPC_Reciprocal, + }; + + // SimplifyAction is a poor-man's variant (plus an additional flag) that + // represents how to replace an NVVM intrinsic with target-generic LLVM IR. + struct SimplifyAction { + // Invariant: At most one of these Optionals has a value. + Optional IID; + Optional CastOp; + Optional BinaryOp; + Optional Special; + + FtzRequirementTy FtzRequirement = FTZ_Any; + + SimplifyAction() = default; + + SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) + : IID(IID), FtzRequirement(FtzReq) {} + + // Cast operations don't have anything to do with FTZ, so we skip that + // argument. + SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} + + SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) + : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} + + SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) + : Special(Special), FtzRequirement(FtzReq) {} + }; + + // Try to generate a SimplifyAction describing how to replace our + // IntrinsicInstr with target-generic LLVM IR. + const SimplifyAction Action = [II]() -> SimplifyAction { + switch (II->getIntrinsicID()) { + // NVVM intrinsics that map directly to LLVM intrinsics. 
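+ // For example, llvm.nvvm.ceil.d(double %x) is rewritten below to the
+ // target-generic llvm.ceil.f64(double %x).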
+ case Intrinsic::nvvm_ceil_d: + return {Intrinsic::ceil, FTZ_Any}; + case Intrinsic::nvvm_ceil_f: + return {Intrinsic::ceil, FTZ_MustBeOff}; + case Intrinsic::nvvm_ceil_ftz_f: + return {Intrinsic::ceil, FTZ_MustBeOn}; + case Intrinsic::nvvm_fabs_d: + return {Intrinsic::fabs, FTZ_Any}; + case Intrinsic::nvvm_fabs_f: + return {Intrinsic::fabs, FTZ_MustBeOff}; + case Intrinsic::nvvm_fabs_ftz_f: + return {Intrinsic::fabs, FTZ_MustBeOn}; + case Intrinsic::nvvm_floor_d: + return {Intrinsic::floor, FTZ_Any}; + case Intrinsic::nvvm_floor_f: + return {Intrinsic::floor, FTZ_MustBeOff}; + case Intrinsic::nvvm_floor_ftz_f: + return {Intrinsic::floor, FTZ_MustBeOn}; + case Intrinsic::nvvm_fma_rn_d: + return {Intrinsic::fma, FTZ_Any}; + case Intrinsic::nvvm_fma_rn_f: + return {Intrinsic::fma, FTZ_MustBeOff}; + case Intrinsic::nvvm_fma_rn_ftz_f: + return {Intrinsic::fma, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmax_d: + return {Intrinsic::maxnum, FTZ_Any}; + case Intrinsic::nvvm_fmax_f: + return {Intrinsic::maxnum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmax_ftz_f: + return {Intrinsic::maxnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmin_d: + return {Intrinsic::minnum, FTZ_Any}; + case Intrinsic::nvvm_fmin_f: + return {Intrinsic::minnum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmin_ftz_f: + return {Intrinsic::minnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_round_d: + return {Intrinsic::round, FTZ_Any}; + case Intrinsic::nvvm_round_f: + return {Intrinsic::round, FTZ_MustBeOff}; + case Intrinsic::nvvm_round_ftz_f: + return {Intrinsic::round, FTZ_MustBeOn}; + case Intrinsic::nvvm_sqrt_rn_d: + return {Intrinsic::sqrt, FTZ_Any}; + case Intrinsic::nvvm_sqrt_f: + // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the + // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts + // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are + // the versions with explicit ftz-ness. + return {Intrinsic::sqrt, FTZ_Any}; + case Intrinsic::nvvm_sqrt_rn_f: + return {Intrinsic::sqrt, FTZ_MustBeOff}; + case Intrinsic::nvvm_sqrt_rn_ftz_f: + return {Intrinsic::sqrt, FTZ_MustBeOn}; + case Intrinsic::nvvm_trunc_d: + return {Intrinsic::trunc, FTZ_Any}; + case Intrinsic::nvvm_trunc_f: + return {Intrinsic::trunc, FTZ_MustBeOff}; + case Intrinsic::nvvm_trunc_ftz_f: + return {Intrinsic::trunc, FTZ_MustBeOn}; + + // NVVM intrinsics that map to LLVM cast operations. + // + // Note that llvm's target-generic conversion operators correspond to the rz + // (round to zero) versions of the nvvm conversion intrinsics, even though + // most everything else here uses the rn (round to nearest even) nvvm ops. + case Intrinsic::nvvm_d2i_rz: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_d2ll_rz: + case Intrinsic::nvvm_f2ll_rz: + return {Instruction::FPToSI}; + case Intrinsic::nvvm_d2ui_rz: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_d2ull_rz: + case Intrinsic::nvvm_f2ull_rz: + return {Instruction::FPToUI}; + case Intrinsic::nvvm_i2d_rz: + case Intrinsic::nvvm_i2f_rz: + case Intrinsic::nvvm_ll2d_rz: + case Intrinsic::nvvm_ll2f_rz: + return {Instruction::SIToFP}; + case Intrinsic::nvvm_ui2d_rz: + case Intrinsic::nvvm_ui2f_rz: + case Intrinsic::nvvm_ull2d_rz: + case Intrinsic::nvvm_ull2f_rz: + return {Instruction::UIToFP}; + + // NVVM intrinsics that map to LLVM binary ops. 
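+ // For example, llvm.nvvm.add.rn.d(%a, %b) becomes a plain 'fadd double %a, %b';
+ // round-to-nearest-even is already the default IR rounding behaviour.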
+ case Intrinsic::nvvm_add_rn_d: + return {Instruction::FAdd, FTZ_Any}; + case Intrinsic::nvvm_add_rn_f: + return {Instruction::FAdd, FTZ_MustBeOff}; + case Intrinsic::nvvm_add_rn_ftz_f: + return {Instruction::FAdd, FTZ_MustBeOn}; + case Intrinsic::nvvm_mul_rn_d: + return {Instruction::FMul, FTZ_Any}; + case Intrinsic::nvvm_mul_rn_f: + return {Instruction::FMul, FTZ_MustBeOff}; + case Intrinsic::nvvm_mul_rn_ftz_f: + return {Instruction::FMul, FTZ_MustBeOn}; + case Intrinsic::nvvm_div_rn_d: + return {Instruction::FDiv, FTZ_Any}; + case Intrinsic::nvvm_div_rn_f: + return {Instruction::FDiv, FTZ_MustBeOff}; + case Intrinsic::nvvm_div_rn_ftz_f: + return {Instruction::FDiv, FTZ_MustBeOn}; + + // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but + // need special handling. + // + // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just + // as well. + case Intrinsic::nvvm_rcp_rn_d: + return {SPC_Reciprocal, FTZ_Any}; + case Intrinsic::nvvm_rcp_rn_f: + return {SPC_Reciprocal, FTZ_MustBeOff}; + case Intrinsic::nvvm_rcp_rn_ftz_f: + return {SPC_Reciprocal, FTZ_MustBeOn}; + + // We do not currently simplify intrinsics that give an approximate + // answer. These include: + // + // - nvvm_cos_approx_{f,ftz_f} + // - nvvm_ex2_approx_{d,f,ftz_f} + // - nvvm_lg2_approx_{d,f,ftz_f} + // - nvvm_sin_approx_{f,ftz_f} + // - nvvm_sqrt_approx_{f,ftz_f} + // - nvvm_rsqrt_approx_{d,f,ftz_f} + // - nvvm_div_approx_{ftz_d,ftz_f,f} + // - nvvm_rcp_approx_ftz_d + // + // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" + // means that fastmath is enabled in the intrinsic. Unfortunately only + // binary operators (currently) have a fastmath bit in SelectionDAG, so + // this information gets lost and we can't select on it. + // + // TODO: div and rcp are lowered to a binary op, so these we could in + // theory lower them to "fast fdiv". + + default: + return {}; + } + }(); + + // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we + // can bail out now. (Notice that in the case that IID is not an NVVM + // intrinsic, we don't have to look up any module metadata, as + // FtzRequirementTy will be FTZ_Any.) + if (Action.FtzRequirement != FTZ_Any) { + StringRef Attr = II->getFunction() + ->getFnAttribute("denormal-fp-math-f32") + .getValueAsString(); + DenormalMode Mode = parseDenormalFPAttribute(Attr); + bool FtzEnabled = Mode.Output != DenormalMode::IEEE; + + if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) + return nullptr; + } + + // Simplify to target-generic intrinsic. + if (Action.IID) { + SmallVector Args(II->arg_operands()); + // All the target-generic intrinsics currently of interest to us have one + // type argument, equal to that of the nvvm intrinsic's argument. + Type *Tys[] = {II->getArgOperand(0)->getType()}; + return CallInst::Create( + Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); + } + + // Simplify to target-generic binary op. + if (Action.BinaryOp) + return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), + II->getArgOperand(1), II->getName()); + + // Simplify to target-generic cast op. + if (Action.CastOp) + return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), + II->getName()); + + // All that's left are the special cases. + if (!Action.Special) + return nullptr; + + switch (*Action.Special) { + case SPC_Reciprocal: + // Simplify reciprocal. 
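+ // The reciprocal is emitted as an exact IR division, e.g.
+ //   %r = fdiv double 1.000000e+00, %x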
+ return BinaryOperator::Create( + Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), + II->getArgOperand(0), II->getName()); + } + llvm_unreachable("All SpecialCase enumerators should be handled in switch."); +} + +Optional +NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) { + return I; + } + return None; +} + int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -41,6 +41,9 @@ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -13,8 +13,11 @@ #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "ppctti" @@ -59,6 +62,158 @@ return TTI::PSK_Software; } +Optional +PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + // Turn PPC lvx -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(0), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Value *Ptr = IC.Builder.CreateBitCast( + II.getArgOperand(0), PointerType::getUnqual(II.getType())); + return new LoadInst(II.getType(), Ptr, "", false, Align(16)); + } + break; + case Intrinsic::ppc_vsx_lxvw4x: + case Intrinsic::ppc_vsx_lxvd2x: { + // Turn PPC VSX loads into normal loads. + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0), + PointerType::getUnqual(II.getType())); + return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1)); + } + case Intrinsic::ppc_altivec_stvx: + case Intrinsic::ppc_altivec_stvxl: + // Turn stvx -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(1), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16)); + } + break; + case Intrinsic::ppc_vsx_stxvw4x: + case Intrinsic::ppc_vsx_stxvd2x: { + // Turn PPC VSX stores into normal stores. 
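+ // Unlike the Altivec cases above, VSX accesses have no alignment
+ // requirement, so no alignment check is needed and align 1 is used.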
+ Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1)); + } + case Intrinsic::ppc_qpx_qvlfs: + // Turn PPC QPX qvlfs -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(0), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Type *VTy = + VectorType::get(IC.Builder.getFloatTy(), + cast(II.getType())->getElementCount()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0), + PointerType::getUnqual(VTy)); + Value *Load = IC.Builder.CreateLoad(VTy, Ptr); + return new FPExtInst(Load, II.getType()); + } + break; + case Intrinsic::ppc_qpx_qvlfd: + // Turn PPC QPX qvlfd -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(0), Align(32), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { + Value *Ptr = IC.Builder.CreateBitCast( + II.getArgOperand(0), PointerType::getUnqual(II.getType())); + return new LoadInst(II.getType(), Ptr, "", false, Align(32)); + } + break; + case Intrinsic::ppc_qpx_qvstfs: + // Turn PPC QPX qvstfs -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(1), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Type *VTy = VectorType::get( + IC.Builder.getFloatTy(), + cast(II.getArgOperand(0)->getType())->getElementCount()); + Value *TOp = IC.Builder.CreateFPTrunc(II.getArgOperand(0), VTy); + Type *OpPtrTy = PointerType::getUnqual(VTy); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(TOp, Ptr, false, Align(16)); + } + break; + case Intrinsic::ppc_qpx_qvstfd: + // Turn PPC QPX qvstfd -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(1), Align(32), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { + Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(II.getArgOperand(0), Ptr, false, Align(32)); + } + break; + + case Intrinsic::ppc_altivec_vperm: + // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. + // Note that ppc_altivec_vperm has a big-endian bias, so when creating + // a vectorshuffle for little endian, we must undo the transformation + // performed on vec_perm in altivec.h. That is, we must complement + // the permutation mask with respect to 31 and reverse the order of + // V1 and V2. + if (Constant *Mask = dyn_cast(II.getArgOperand(2))) { + assert(cast(Mask->getType())->getNumElements() == 16 && + "Bad type for intrinsic!"); + + // Check that all of the elements are integer constants or undefs. + bool AllEltsOk = true; + for (unsigned i = 0; i != 16; ++i) { + Constant *Elt = Mask->getAggregateElement(i); + if (!Elt || !(isa(Elt) || isa(Elt))) { + AllEltsOk = false; + break; + } + } + + if (AllEltsOk) { + // Cast the input vectors to byte vectors. + Value *Op0 = + IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType()); + Value *Op1 = + IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType()); + Value *Result = UndefValue::get(Op0->getType()); + + // Only extract each element once. 
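+ // ExtractedElts caches the lanes already pulled out of the two sources, so a
+ // byte selected several times by the mask reuses a single extractelement.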
+ Value *ExtractedElts[32]; + memset(ExtractedElts, 0, sizeof(ExtractedElts)); + + for (unsigned i = 0; i != 16; ++i) { + if (isa(Mask->getAggregateElement(i))) + continue; + unsigned Idx = + cast(Mask->getAggregateElement(i))->getZExtValue(); + Idx &= 31; // Match the hardware behavior. + if (DL.isLittleEndian()) + Idx = 31 - Idx; + + if (!ExtractedElts[Idx]) { + Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; + Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; + ExtractedElts[Idx] = IC.Builder.CreateExtractElement( + Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15)); + } + + // Insert this value into the result vector. + Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx], + IC.Builder.getInt32(i)); + } + return CastInst::Create(Instruction::BitCast, Result, II.getType()); + } + } + break; + } + return None; +} + int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { if (DisablePPCConstHoist) @@ -849,7 +1004,7 @@ // The cost of the load constant for a vector extract is disregarded // (invariant, easily schedulable). return vectorCostAdjustment(1, Opcode, Val, nullptr); - + } else if (ST->hasDirectMove()) // Assume permute has standard cost. // Assume move-to/move-from VSR have 2x standard cost. diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -47,6 +47,7 @@ X86IndirectThunks.cpp X86InterleavedAccess.cpp X86InsertPrefetch.cpp + X86InstCombineIntrinsic.cpp X86InstrFMA3Info.cpp X86InstrFoldTables.cpp X86InstrInfo.cpp diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -0,0 +1,2007 @@ +//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// X86 target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "X86TargetTransformInfo.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86tti" + +/// Return a constant boolean vector that has true elements in all positions +/// where the input constant data vector has an element with the sign bit set. +static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { + SmallVector BoolVec; + IntegerType *BoolTy = Type::getInt1Ty(V->getContext()); + for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) { + Constant *Elt = V->getElementAsConstant(I); + assert((isa(Elt) || isa(Elt)) && + "Unexpected constant data vector element type"); + bool Sign = V->getElementType()->isIntegerTy() + ? 
cast(Elt)->isNegative() + : cast(Elt)->isNegative(); + BoolVec.push_back(ConstantInt::get(BoolTy, Sign)); + } + return ConstantVector::get(BoolVec); +} + +// TODO: If the x86 backend knew how to convert a bool vector mask back to an +// XMM register mask efficiently, we could transform all x86 masked intrinsics +// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. +static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { + Value *Ptr = II.getOperand(0); + Value *Mask = II.getOperand(1); + Constant *ZeroVec = Constant::getNullValue(II.getType()); + + // Special case a zero mask since that's not a ConstantDataVector. + // This masked load instruction creates a zero vector. + if (isa(Mask)) + return IC.replaceInstUsesWith(II, ZeroVec); + + auto *ConstMask = dyn_cast(Mask); + if (!ConstMask) + return nullptr; + + // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic + // to allow target-independent optimizations. + + // First, cast the x86 intrinsic scalar pointer to a vector pointer to match + // the LLVM intrinsic definition for the pointer argument. + unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); + PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); + Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); + + // Second, convert the x86 XMM integer vector mask to a vector of bools based + // on each element's most significant bit (the sign bit). + Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); + + // The pass-through vector for an x86 masked load is a zero vector. + CallInst *NewMaskedLoad = + IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec); + return IC.replaceInstUsesWith(II, NewMaskedLoad); +} + +// TODO: If the x86 backend knew how to convert a bool vector mask back to an +// XMM register mask efficiently, we could transform all x86 masked intrinsics +// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. +static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { + Value *Ptr = II.getOperand(0); + Value *Mask = II.getOperand(1); + Value *Vec = II.getOperand(2); + + // Special case a zero mask since that's not a ConstantDataVector: + // this masked store instruction does nothing. + if (isa(Mask)) { + IC.eraseInstFromFunction(II); + return true; + } + + // The SSE2 version is too weird (eg, unaligned but non-temporal) to do + // anything else at this level. + if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) + return false; + + auto *ConstMask = dyn_cast(Mask); + if (!ConstMask) + return false; + + // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic + // to allow target-independent optimizations. + + // First, cast the x86 intrinsic scalar pointer to a vector pointer to match + // the LLVM intrinsic definition for the pointer argument. + unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); + PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); + Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); + + // Second, convert the x86 XMM integer vector mask to a vector of bools based + // on each element's most significant bit (the sign bit). + Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); + + IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); + + // 'Replace uses' doesn't work for stores. Erase the original masked store. 
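+ // The llvm.masked.store created above already carries the converted mask and
+ // pointer, so dropping the x86 intrinsic completes the rewrite.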
+ IC.eraseInstFromFunction(II); + return true; +} + +static Value *simplifyX86immShift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + bool IsImm = false; + + switch (II.getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + LogicalShift = false; + ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + LogicalShift = true; + ShiftLeft = false; + break; + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: + LogicalShift = true; + ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + auto Vec = II.getArgOperand(0); + auto Amt = II.getArgOperand(1); + auto VT = cast(Vec->getType()); + auto SVT = VT->getElementType(); + auto AmtVT = Amt->getType(); + unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If the shift amount is guaranteed to be in-range we can replace it with a + // generic shift. If its guaranteed to be out of range, logical shifts combine + // to zero and arithmetic shifts are clamped to (BitWidth - 1). 
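+ // E.g. psrli.d by a constant 3 becomes an lshr by a splat of 3, while a
+ // constant shift amount of 37 folds the whole call to zeroinitializer.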
+ if (IsImm) { + assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); + KnownBits KnownAmtBits = + llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); + if (KnownAmtBits.getMaxValue().ult(BitWidth)) { + Amt = Builder.CreateZExtOrTrunc(Amt, SVT); + Amt = Builder.CreateVectorSplat(VWidth, Amt); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + if (KnownAmtBits.getMinValue().uge(BitWidth)) { + if (LogicalShift) + return ConstantAggregateZero::get(VT); + Amt = ConstantInt::get(SVT, BitWidth - 1); + return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); + } + } else { + // Ensure the first element has an in-range value and the rest of the + // elements in the bottom 64 bits are zero. + assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && + cast(AmtVT)->getElementType() == SVT && + "Unexpected shift-by-scalar type"); + unsigned NumAmtElts = cast(AmtVT)->getNumElements(); + APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); + APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); + KnownBits KnownLowerBits = llvm::computeKnownBits( + Amt, DemandedLower, II.getModule()->getDataLayout()); + KnownBits KnownUpperBits = llvm::computeKnownBits( + Amt, DemandedUpper, II.getModule()->getDataLayout()); + if (KnownLowerBits.getMaxValue().ult(BitWidth) && + (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) { + SmallVector ZeroSplat(VWidth, 0); + Amt = Builder.CreateShuffleVector(Amt, Amt, ZeroSplat); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + } + + // Simplify if count is constant vector. + auto CDV = dyn_cast(Amt); + if (!CDV) + return nullptr; + + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && + cast(AmtVT)->getElementType() == SVT && + "Unexpected shift-by-scalar type"); + + // Concatenate the sub-elements to create the 64-bit value. + APInt Count(64, 0); + for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast(CDV->getElementAsConstant(SubEltIdx)); + Count <<= BitWidth; + Count |= SubElt->getValue().zextOrTrunc(64); + } + + // If shift-by-zero then just return the original value. + if (Count.isNullValue()) + return Vec; + + // Handle cases when Shift >= BitWidth. + if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } + + // Get a constant vector of the same type as the first operand. + auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. +// Unlike the generic IR shifts, the intrinsics have defined behaviour for out +// of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 
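+ // E.g. x86_avx2_psrlv_d(<4 x i32> %v, <4 x i32> %a) where every lane of %a is
+ // known to be below 32 becomes 'lshr <4 x i32> %v, %a'.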
+static Value *simplifyX86varShift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + + switch (II.getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: + LogicalShift = false; + ShiftLeft = false; + break; + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: + LogicalShift = true; + ShiftLeft = false; + break; + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: + LogicalShift = true; + ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + auto Vec = II.getArgOperand(0); + auto Amt = II.getArgOperand(1); + auto VT = cast(II.getType()); + auto SVT = VT->getElementType(); + int NumElts = VT->getNumElements(); + int BitWidth = SVT->getIntegerBitWidth(); + + // If the shift amount is guaranteed to be in-range we can replace it with a + // generic shift. + APInt UpperBits = + APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); + if (llvm::MaskedValueIsZero(Amt, UpperBits, + II.getModule()->getDataLayout())) { + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + + // Simplify if all shift amounts are constant/undef. + auto *CShift = dyn_cast(Amt); + if (!CShift) + return nullptr; + + // Collect each element's shift amount. + // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. + bool AnyOutOfRange = false; + SmallVector ShiftAmts; + for (int I = 0; I < NumElts; ++I) { + auto *CElt = CShift->getAggregateElement(I); + if (CElt && isa(CElt)) { + ShiftAmts.push_back(-1); + continue; + } + + auto *COp = dyn_cast_or_null(CElt); + if (!COp) + return nullptr; + + // Handle out of range shifts. + // If LogicalShift - set to BitWidth (special case). + // If ArithmeticShift - set to (BitWidth - 1) (sign splat). + APInt ShiftVal = COp->getValue(); + if (ShiftVal.uge(BitWidth)) { + AnyOutOfRange = LogicalShift; + ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); + continue; + } + + ShiftAmts.push_back((int)ShiftVal.getZExtValue()); + } + + // If all elements out of range or UNDEF, return vector of zeros/undefs. + // ArithmeticShift should only hit this if they are all UNDEF. 
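+ // (Undef lanes were recorded as -1 and out-of-range logical lanes as BitWidth
+ // above, so the lambda below catches both.)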
+ auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; + if (llvm::all_of(ShiftAmts, OutOfRange)) { + SmallVector ConstantVec; + for (int Idx : ShiftAmts) { + if (Idx < 0) { + ConstantVec.push_back(UndefValue::get(SVT)); + } else { + assert(LogicalShift && "Logical shift expected"); + ConstantVec.push_back(ConstantInt::getNullValue(SVT)); + } + } + return ConstantVector::get(ConstantVec); + } + + // We can't handle only some out of range values with generic logical shifts. + if (AnyOutOfRange) + return nullptr; + + // Build the shift amount constant vector. + SmallVector ShiftVecAmts; + for (int Idx : ShiftAmts) { + if (Idx < 0) + ShiftVecAmts.push_back(UndefValue::get(SVT)); + else + ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); + } + auto ShiftVec = ConstantVector::get(ShiftVecAmts); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +static Value *simplifyX86pack(IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, bool IsSigned) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Type *ResTy = II.getType(); + + // Fast all undef handling. + if (isa(Arg0) && isa(Arg1)) + return UndefValue::get(ResTy); + + auto *ArgTy = cast(Arg0->getType()); + unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; + unsigned NumSrcElts = ArgTy->getNumElements(); + assert(cast(ResTy)->getNumElements() == (2 * NumSrcElts) && + "Unexpected packing types"); + + unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; + unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); + unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); + assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && + "Unexpected packing types"); + + // Constant folding. + if (!isa(Arg0) || !isa(Arg1)) + return nullptr; + + // Clamp Values - signed/unsigned both use signed clamp values, but they + // differ on the min/max values. + APInt MinValue, MaxValue; + if (IsSigned) { + // PACKSS: Truncate signed value with signed saturation. + // Source values less than dst minint are saturated to minint. + // Source values greater than dst maxint are saturated to maxint. + MinValue = + APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); + MaxValue = + APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); + } else { + // PACKUS: Truncate signed value with unsigned saturation. + // Source values less than zero are saturated to zero. + // Source values greater than dst maxuint are saturated to maxuint. + MinValue = APInt::getNullValue(SrcScalarSizeInBits); + MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); + } + + auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); + auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); + Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); + Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); + Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); + Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); + + // Shuffle clamped args together at the lane level. 
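+ // E.g. for the 128-bit i16->i8 pack the mask is simply <0..7, 8..15> (all of
+ // Arg0 then all of Arg1); the 256-bit form keeps each 128-bit lane separate:
+ // <0..7, 16..23, 8..15, 24..31>.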
+ SmallVector PackMask; + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) + PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); + for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) + PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); + } + auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); + + // Truncate to dst size. + return Builder.CreateTrunc(Shuffle, ResTy); +} + +static Value *simplifyX86movmsk(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *Arg = II.getArgOperand(0); + Type *ResTy = II.getType(); + + // movmsk(undef) -> zero as we must ensure the upper bits are zero. + if (isa(Arg)) + return Constant::getNullValue(ResTy); + + auto *ArgTy = dyn_cast(Arg->getType()); + // We can't easily peek through x86_mmx types. + if (!ArgTy) + return nullptr; + + // Expand MOVMSK to compare/bitcast/zext: + // e.g. PMOVMSKB(v16i8 x): + // %cmp = icmp slt <16 x i8> %x, zeroinitializer + // %int = bitcast <16 x i1> %cmp to i16 + // %res = zext i16 %int to i32 + unsigned NumElts = ArgTy->getNumElements(); + Type *IntegerVecTy = VectorType::getInteger(ArgTy); + Type *IntegerTy = Builder.getIntNTy(NumElts); + + Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); + Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); + Res = Builder.CreateBitCast(Res, IntegerTy); + Res = Builder.CreateZExtOrTrunc(Res, ResTy); + return Res; +} + +static Value *simplifyX86addcarry(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *CarryIn = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + Value *Op2 = II.getArgOperand(2); + Type *RetTy = II.getType(); + Type *OpTy = Op1->getType(); + assert(RetTy->getStructElementType(0)->isIntegerTy(8) && + RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && + "Unexpected types for x86 addcarry"); + + // If carry-in is zero, this is just an unsigned add with overflow. + if (match(CarryIn, PatternMatch::m_ZeroInt())) { + Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, + {Op1, Op2}); + // The types have to be adjusted to match the x86 call types. + Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); + Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), + Builder.getInt8Ty()); + Value *Res = UndefValue::get(RetTy); + Res = Builder.CreateInsertValue(Res, UAddOV, 0); + return Builder.CreateInsertValue(Res, UAddResult, 1); + } + + return nullptr; +} + +static Value *simplifyX86insertps(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + auto *CInt = dyn_cast(II.getArgOperand(2)); + if (!CInt) + return nullptr; + + VectorType *VecTy = cast(II.getType()); + assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); + + // The immediate permute control byte looks like this: + // [3:0] - zero mask for each 32-bit lane + // [5:4] - select one 32-bit destination lane + // [7:6] - select one 32-bit source lane + + uint8_t Imm = CInt->getZExtValue(); + uint8_t ZMask = Imm & 0xf; + uint8_t DestLane = (Imm >> 4) & 0x3; + uint8_t SourceLane = (Imm >> 6) & 0x3; + + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); + + // If all zero mask bits are set, this was just a weird way to + // generate a zero vector. + if (ZMask == 0xf) + return ZeroVector; + + // Initialize by passing all of the first source bits through. + int ShuffleMask[4] = {0, 1, 2, 3}; + + // We may replace the second operand with the zero vector. 
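+ // (That happens below when both sources are the same value or the zero mask
+ // also covers the destination lane.)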
+ Value *V1 = II.getArgOperand(1); + + if (ZMask) { + // If the zero mask is being used with a single input or the zero mask + // overrides the destination lane, this is a shuffle with the zero vector. + if ((II.getArgOperand(0) == II.getArgOperand(1)) || + (ZMask & (1 << DestLane))) { + V1 = ZeroVector; + // We may still move 32-bits of the first source vector from one lane + // to another. + ShuffleMask[DestLane] = SourceLane; + // The zero mask may override the previous insert operation. + for (unsigned i = 0; i < 4; ++i) + if ((ZMask >> i) & 0x1) + ShuffleMask[i] = i + 4; + } else { + // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? + return nullptr; + } + } else { + // Replace the selected destination lane with the selected source lane. + ShuffleMask[DestLane] = SourceLane + 4; + } + + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); +} + +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. +static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, + ConstantInt *CILength, ConstantInt *CIIndex, + InstCombiner::BuilderTy &Builder) { + auto LowConstantHighUndef = [&](uint64_t Val) { + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + }; + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast(Op0); + ConstantInt *CI0 = + C0 ? dyn_cast_or_null(C0->getAggregateElement((unsigned)0)) + : nullptr; + + // Attempt to constant fold. + if (CILength && CIIndex) { + // From AMD documentation: "The bit index and field length are each six + // bits in length other bits of the field are ignored." + APInt APIndex = CIIndex->getValue().zextOrTrunc(6); + APInt APLength = CILength->getValue().zextOrTrunc(6); + + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize EXTRQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + auto *ShufTy = FixedVectorType::get(IntTy8, 16); + + SmallVector ShuffleMask; + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back(i + Index); + for (int i = Length; i != 8; ++i) + ShuffleMask.push_back(i + 16); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(-1); + + Value *SV = Builder.CreateShuffleVector( + Builder.CreateBitCast(Op0, ShufTy), + ConstantAggregateZero::get(ShufTy), ShuffleMask); + return Builder.CreateBitCast(SV, II.getType()); + } + + // Constant Fold - shift Index'th bit to lowest position and mask off + // Length bits. 
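+ // E.g. extrqi(%x, len 8, idx 16) with a constant %x folds to
+ // {(x >> 16) & 0xff, undef}.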
+ if (CI0) { + APInt Elt = CI0->getValue(); + Elt.lshrInPlace(Index); + Elt = Elt.zextOrTrunc(Length); + return LowConstantHighUndef(Elt.getZExtValue()); + } + + // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { + Value *Args[] = {Op0, CILength, CIIndex}; + Module *M = II.getModule(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + return Builder.CreateCall(F, Args); + } + } + + // Constant Fold - extraction from zero is always {zero, undef}. + if (CI0 && CI0->isZero()) + return LowConstantHighUndef(0); + + return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. +static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, + APInt APLength, APInt APIndex, + InstCombiner::BuilderTy &Builder) { + // From AMD documentation: "The bit index and field length are each six bits + // in length other bits of the field are ignored." + APIndex = APIndex.zextOrTrunc(6); + APLength = APLength.zextOrTrunc(6); + + // Attempt to constant fold. + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize INSERTQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + auto *ShufTy = FixedVectorType::get(IntTy8, 16); + + SmallVector ShuffleMask; + for (int i = 0; i != (int)Index; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back(i + 16); + for (int i = Index + Length; i != 8; ++i) + ShuffleMask.push_back(i); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(-1); + + Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), + Builder.CreateBitCast(Op1, ShufTy), + ShuffleMask); + return Builder.CreateBitCast(SV, II.getType()); + } + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast(Op0); + Constant *C1 = dyn_cast(Op1); + ConstantInt *CI00 = + C0 ? dyn_cast_or_null(C0->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CI10 = + C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)0)) + : nullptr; + + // Constant Fold - insert bottom Length bits starting at the Index'th bit. 
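+ // E.g. insertqi(%a, %b, len 8, idx 16) with constant operands folds to
+ // {(a & ~(0xff << 16)) | ((b & 0xff) << 16), undef}.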
+ if (CI00 && CI10) { + APInt V00 = CI00->getValue(); + APInt V10 = CI10->getValue(); + APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); + V00 = V00 & ~Mask; + V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); + APInt Val = V00 | V10; + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + } + + // If we were an INSERTQ call, we'll save demanded elements if we convert to + // INSERTQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Constant *CILength = ConstantInt::get(IntTy8, Length, false); + Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + + Value *Args[] = {Op0, Op1, CILength, CIIndex}; + Module *M = II.getModule(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return Builder.CreateCall(F, Args); + } + + return nullptr; +} + +/// Attempt to convert pshufb* to shufflevector if the mask is constant. +static Value *simplifyX86pshufb(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Constant *V = dyn_cast(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast(II.getType()); + unsigned NumElts = VecTy->getNumElements(); + assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && + "Unexpected number of elements in shuffle mask!"); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[64]; + + // Each byte in the shuffle control mask forms an index to permute the + // corresponding byte in the destination operand. + for (unsigned I = 0; I < NumElts; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa(COp) && !isa(COp))) + return nullptr; + + if (isa(COp)) { + Indexes[I] = -1; + continue; + } + + int8_t Index = cast(COp)->getValue().getZExtValue(); + + // If the most significant bit (bit[7]) of each byte of the shuffle + // control mask is set, then zero is written in the result byte. + // The zero vector is in the right-hand side of the resulting + // shufflevector. + + // The value of each index for the high 128-bit lane is the least + // significant 4 bits of the respective shuffle control byte. + Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); + Indexes[I] = Index; + } + + auto V1 = II.getArgOperand(0); + auto V2 = Constant::getNullValue(VecTy); + return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); +} + +/// Attempt to convert vpermilvar* to shufflevector if the mask is constant. +static Value *simplifyX86vpermilvar(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Constant *V = dyn_cast(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast(II.getType()); + unsigned NumElts = VecTy->getNumElements(); + bool IsPD = VecTy->getScalarType()->isDoubleTy(); + unsigned NumLaneElts = IsPD ? 2 : 4; + assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[16]; + + // The intrinsics only read one or two bits, clear the rest. 
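+ // (vpermilvar.ps reads bits [1:0] of each control element; the pd variants
+ // read only bit 1, hence the extra shift below.)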
+ for (unsigned I = 0; I < NumElts; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa(COp) && !isa(COp))) + return nullptr; + + if (isa(COp)) { + Indexes[I] = -1; + continue; + } + + APInt Index = cast(COp)->getValue(); + Index = Index.zextOrTrunc(32).getLoBits(2); + + // The PD variants uses bit 1 to select per-lane element index, so + // shift down to convert to generic shuffle mask index. + if (IsPD) + Index.lshrInPlace(1); + + // The _256 variants are a bit trickier since the mask bits always index + // into the corresponding 128 half. In order to convert to a generic + // shuffle, we have to make that explicit. + Index += APInt(32, (I / NumLaneElts) * NumLaneElts); + + Indexes[I] = Index.getZExtValue(); + } + + auto V1 = II.getArgOperand(0); + auto V2 = UndefValue::get(V1->getType()); + return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); +} + +/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. +static Value *simplifyX86vpermv(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + auto *V = dyn_cast(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast(II.getType()); + unsigned Size = VecTy->getNumElements(); + assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && + "Unexpected shuffle mask size"); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[64]; + + for (unsigned I = 0; I < Size; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa(COp) && !isa(COp))) + return nullptr; + + if (isa(COp)) { + Indexes[I] = -1; + continue; + } + + uint32_t Index = cast(COp)->getZExtValue(); + Index &= Size - 1; + Indexes[I] = Index; + } + + auto V1 = II.getArgOperand(0); + auto V2 = UndefValue::get(VecTy); + return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, Size)); +} + +Optional +X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, + unsigned DemandedWidth) { + APInt UndefElts(Width, 0); + APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); + return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); + }; + + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + case Intrinsic::x86_bmi_bextr_32: + case Intrinsic::x86_bmi_bextr_64: + case Intrinsic::x86_tbm_bextri_u32: + case Intrinsic::x86_tbm_bextri_u64: + // If the RHS is a constant we can try some simplifications. + if (auto *C = dyn_cast(II.getArgOperand(1))) { + uint64_t Shift = C->getZExtValue(); + uint64_t Length = (Shift >> 8) & 0xff; + Shift &= 0xff; + unsigned BitWidth = II.getType()->getIntegerBitWidth(); + // If the length is 0 or the shift is out of range, replace with zero. + if (Length == 0 || Shift >= BitWidth) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + // If the LHS is also a constant, we can completely constant fold this. + if (auto *InC = dyn_cast(II.getArgOperand(0))) { + uint64_t Result = InC->getZExtValue() >> Shift; + if (Length > BitWidth) + Length = BitWidth; + Result &= maskTrailingOnes(Length); + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we + // are only masking bits that a shift already cleared? + } + break; + + case Intrinsic::x86_bmi_bzhi_32: + case Intrinsic::x86_bmi_bzhi_64: + // If the RHS is a constant we can try some simplifications. 
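+ // BZHI copies the source and zeroes every bit at or above the index taken
+ // from bits [7:0] of the second operand.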
+ if (auto *C = dyn_cast(II.getArgOperand(1))) { + uint64_t Index = C->getZExtValue() & 0xff; + unsigned BitWidth = II.getType()->getIntegerBitWidth(); + if (Index >= BitWidth) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + if (Index == 0) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + // If the LHS is also a constant, we can completely constant fold this. + if (auto *InC = dyn_cast(II.getArgOperand(0))) { + uint64_t Result = InC->getZExtValue(); + Result &= maskTrailingOnes(Index); + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + // TODO should we convert this to an AND if the RHS is constant? + } + break; + case Intrinsic::x86_bmi_pext_32: + case Intrinsic::x86_bmi_pext_64: + if (auto *MaskC = dyn_cast(II.getArgOperand(1))) { + if (MaskC->isNullValue()) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + if (MaskC->isAllOnesValue()) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + + if (auto *SrcC = dyn_cast(II.getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToSet = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToTest = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToSet <<= 1; + // Clear lowest set bit. + Mask &= Mask - 1; + } + + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + } + break; + case Intrinsic::x86_bmi_pdep_32: + case Intrinsic::x86_bmi_pdep_64: + if (auto *MaskC = dyn_cast(II.getArgOperand(1))) { + if (MaskC->isNullValue()) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + if (MaskC->isAllOnesValue()) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + + if (auto *SrcC = dyn_cast(II.getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToTest = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToSet = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToTest <<= 1; + // Clear lowest set bit; + Mask &= Mask - 1; + } + + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + } + break; + + case Intrinsic::x86_sse_cvtss2si: + case Intrinsic::x86_sse_cvtss2si64: + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvtsd2si: + case Intrinsic::x86_sse2_cvtsd2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + case Intrinsic::x86_avx512_vcvtss2si32: + case Intrinsic::x86_avx512_vcvtss2si64: + case Intrinsic::x86_avx512_vcvtss2usi32: + case Intrinsic::x86_avx512_vcvtss2usi64: + case Intrinsic::x86_avx512_vcvtsd2si32: + case Intrinsic::x86_avx512_vcvtsd2si64: + case Intrinsic::x86_avx512_vcvtsd2usi32: + case Intrinsic::x86_avx512_vcvtsd2usi64: + case Intrinsic::x86_avx512_cvttss2si: + case Intrinsic::x86_avx512_cvttss2si64: + case Intrinsic::x86_avx512_cvttss2usi: + case Intrinsic::x86_avx512_cvttss2usi64: + case Intrinsic::x86_avx512_cvttsd2si: + case Intrinsic::x86_avx512_cvttsd2si64: + case Intrinsic::x86_avx512_cvttsd2usi: + case Intrinsic::x86_avx512_cvttsd2usi64: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. 
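+ // For example, if the operand is insertelement %v, %f, 1, only lane 0 is
+ // read, so the insert into lane 1 is dead and %v can be used directly.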
+ Value *Arg = II.getArgOperand(0); + unsigned VWidth = cast(Arg->getType())->getNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_avx2_pmovmskb: + if (Value *V = simplifyX86movmsk(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_comieq_sd: + case Intrinsic::x86_sse2_comige_sd: + case Intrinsic::x86_sse2_comigt_sd: + case Intrinsic::x86_sse2_comile_sd: + case Intrinsic::x86_sse2_comilt_sd: + case Intrinsic::x86_sse2_comineq_sd: + case Intrinsic::x86_sse2_ucomieq_sd: + case Intrinsic::x86_sse2_ucomige_sd: + case Intrinsic::x86_sse2_ucomigt_sd: + case Intrinsic::x86_sse2_ucomile_sd: + case Intrinsic::x86_sse2_ucomilt_sd: + case Intrinsic::x86_sse2_ucomineq_sd: + case Intrinsic::x86_avx512_vcomi_ss: + case Intrinsic::x86_avx512_vcomi_sd: + case Intrinsic::x86_avx512_mask_cmp_ss: + case Intrinsic::x86_avx512_mask_cmp_sd: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. + bool MadeChange = false; + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + unsigned VWidth = cast(Arg0->getType())->getNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + case Intrinsic::x86_avx512_cmp_pd_128: + case Intrinsic::x86_avx512_cmp_pd_256: + case Intrinsic::x86_avx512_cmp_pd_512: + case Intrinsic::x86_avx512_cmp_ps_128: + case Intrinsic::x86_avx512_cmp_ps_256: + case Intrinsic::x86_avx512_cmp_ps_512: { + // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + bool Arg0IsZero = match(Arg0, PatternMatch::m_PosZeroFP()); + if (Arg0IsZero) + std::swap(Arg0, Arg1); + Value *A, *B; + // This fold requires only the NINF(not +/- inf) since inf minus + // inf is nan. + // NSZ(No Signed Zeros) is not needed because zeros of any sign are + // equal for both compares. + // NNAN is not needed because nans compare the same for both compares. + // The compare intrinsic uses the above assumptions and therefore + // doesn't require additional flags. 
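+ // For example, comparing an ninf (%a - %b) against +0.0 yields the same result
+ // for every predicate as comparing %a against %b directly, so the subtraction
+ // can be dropped.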
+ if ((match(Arg0, + PatternMatch::m_OneUse(PatternMatch::m_FSub( + PatternMatch::m_Value(A), PatternMatch::m_Value(B)))) && + match(Arg1, PatternMatch::m_PosZeroFP()) && isa(Arg0) && + cast(Arg0)->getFastMathFlags().noInfs())) { + if (Arg0IsZero) + std::swap(A, B); + IC.replaceOperand(II, 0, A); + IC.replaceOperand(II, 1, B); + return &II; + } + break; + } + + case Intrinsic::x86_avx512_add_ps_512: + case Intrinsic::x86_avx512_div_ps_512: + case Intrinsic::x86_avx512_mul_ps_512: + case Intrinsic::x86_avx512_sub_ps_512: + case Intrinsic::x86_avx512_add_pd_512: + case Intrinsic::x86_avx512_div_pd_512: + case Intrinsic::x86_avx512_mul_pd_512: + case Intrinsic::x86_avx512_sub_pd_512: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. + if (auto *R = dyn_cast(II.getArgOperand(2))) { + if (R->getValue() == 4) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + + Value *V; + switch (IID) { + default: + llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_add_ps_512: + case Intrinsic::x86_avx512_add_pd_512: + V = IC.Builder.CreateFAdd(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_sub_ps_512: + case Intrinsic::x86_avx512_sub_pd_512: + V = IC.Builder.CreateFSub(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_mul_ps_512: + case Intrinsic::x86_avx512_mul_pd_512: + V = IC.Builder.CreateFMul(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_div_ps_512: + case Intrinsic::x86_avx512_div_pd_512: + V = IC.Builder.CreateFDiv(Arg0, Arg1); + break; + } + + return IC.replaceInstUsesWith(II, V); + } + } + break; + + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. + if (auto *R = dyn_cast(II.getArgOperand(4))) { + if (R->getValue() == 4) { + // Extract the element as scalars. + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); + Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); + + Value *V; + switch (IID) { + default: + llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + V = IC.Builder.CreateFAdd(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + V = IC.Builder.CreateFSub(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + V = IC.Builder.CreateFMul(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + V = IC.Builder.CreateFDiv(LHS, RHS); + break; + } + + // Handle the masking aspect of the intrinsic. + Value *Mask = II.getArgOperand(3); + auto *C = dyn_cast(Mask); + // We don't need a select if we know the mask bit is a 1. + if (!C || !C->getValue()[0]) { + // Cast the mask to an i1 vector and then extract the lowest element. 
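+ // For example, an i8 mask becomes an <8 x i1> vector, and bit 0 selects
+ // between the scalar result and lane 0 of the passthru operand.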
+ auto *MaskTy = FixedVectorType::get(
+ IC.Builder.getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
+ Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
+ // Extract the lowest element from the passthru operand.
+ Value *Passthru =
+ IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
+ V = IC.Builder.CreateSelect(Mask, V, Passthru);
+ }
+
+ // Insert the result back into the original argument 0.
+ V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
+
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+ break;
+
+ // Constant fold ashr( <A x Bi>, Ci ).
+ // Constant fold lshr( <A x Bi>, Ci ).
+ // Constant fold shl( <A x Bi>, Ci ).
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ if (Value *V = simplifyX86immShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512: {
+ if (Value *V = simplifyX86immShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+ // SSE2/AVX2 uses only the first 64 bits of the 128-bit vector
+ // operand to compute the shift amount.
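+ // For psrl.w, for instance, the <8 x i16> count operand contributes only its
+ // low four elements (64 bits), so the demanded-elements analysis can ignore
+ // the upper half.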
+ Value *Arg1 = II.getArgOperand(1); + assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && + "Unexpected packed shift size"); + unsigned VWidth = cast(Arg1->getType())->getNumElements(); + + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { + return IC.replaceOperand(II, 1, V); + } + break; + } + + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: + if (Value *V = simplifyX86varShift(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + if (Value *V = simplifyX86pack(II, IC.Builder, true)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: + if (Value *V = simplifyX86pack(II, IC.Builder, false)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_pclmulqdq: + case Intrinsic::x86_pclmulqdq_256: + case Intrinsic::x86_pclmulqdq_512: { + if (auto *C = dyn_cast(II.getArgOperand(2))) { + unsigned Imm = C->getZExtValue(); + + bool MadeChange = false; + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + unsigned VWidth = cast(Arg0->getType())->getNumElements(); + + APInt UndefElts1(VWidth, 0); + APInt DemandedElts1 = + APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); + if (Value *V = + IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + + APInt UndefElts2(VWidth, 0); + APInt DemandedElts2 = + APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); + if (Value *V = + IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + + // If either input elements are undef, the result is zero. 
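+ // (The undef input half may be chosen to be zero, and a carry-less multiply
+ // by zero is zero.)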
+ if (DemandedElts1.isSubsetOf(UndefElts1) || + DemandedElts2.isSubsetOf(UndefElts2)) { + return IC.replaceInstUsesWith(II, + ConstantAggregateZero::get(II.getType())); + } + + if (MadeChange) { + return &II; + } + } + break; + } + + case Intrinsic::x86_sse41_insertps: + if (Value *V = simplifyX86insertps(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse4a_extrq: { + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth0 = cast(Op0->getType())->getNumElements(); + unsigned VWidth1 = cast(Op1->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 16 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast(Op1); + ConstantInt *CILength = + C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CIIndex = + C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or EXTRQI call. + if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + + // EXTRQ only uses the lowest 64-bits of the first 128-bit vector + // operands and the lowest 16-bits of the second. + bool MadeChange = false; + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + + case Intrinsic::x86_sse4a_extrqi: { + // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining + // bits of the lower 64-bits. The upper 64-bits are undefined. + Value *Op0 = II.getArgOperand(0); + unsigned VWidth = cast(Op0->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast(II.getArgOperand(1)); + ConstantInt *CIIndex = dyn_cast(II.getArgOperand(2)); + + // Attempt to simplify to a constant or shuffle vector. + if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + + // EXTRQI only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_sse4a_insertq: { + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth = cast(Op0->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + cast(Op1->getType())->getNumElements() == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast(Op1); + ConstantInt *CI11 = + C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 
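+ // For INSERTQ the field length is taken from bits [5:0] and the insertion
+ // index from bits [13:8] of the upper 64-bit element of operand 1.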
+ if (CI11) { + const APInt &V11 = CI11->getValue(); + APInt Len = V11.zextOrTrunc(6); + APInt Idx = V11.lshr(8).zextOrTrunc(6); + if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + } + + // INSERTQ only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_sse4a_insertqi: { + // INSERTQI: Extract lowest Length bits from lower half of second source and + // insert over first source starting at Index bit. The upper 64-bits are + // undefined. + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth0 = cast(Op0->getType())->getNumElements(); + unsigned VWidth1 = cast(Op1->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 2 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast(II.getArgOperand(2)); + ConstantInt *CIIndex = dyn_cast(II.getArgOperand(3)); + + // Attempt to simplify to a constant or shuffle vector. + if (CILength && CIIndex) { + APInt Len = CILength->getValue().zextOrTrunc(6); + APInt Idx = CIIndex->getValue().zextOrTrunc(6); + if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + } + + // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector + // operands. + bool MadeChange = false; + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + + case Intrinsic::x86_sse41_pblendvb: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_avx_blendv_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx2_pblendvb: { + // fold (blend A, A, Mask) -> A + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + Value *Mask = II.getArgOperand(2); + if (Op0 == Op1) { + return IC.replaceInstUsesWith(II, Op0); + } + + // Zero Mask - select 1st argument. + if (isa(Mask)) { + return IC.replaceInstUsesWith(II, Op0); + } + + // Constant Mask - select 1st/2nd argument lane based on top bit of mask. + if (auto *ConstantMask = dyn_cast(Mask)) { + Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); + return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); + } + + // Convert to a vector select if we can bypass casts and find a boolean + // vector condition value. 
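+ // For example, blendvps(%x, %y, sext(<4 x i1> %c)) selects %y wherever the
+ // mask's sign bit is set, which is exactly select <4 x i1> %c, %y, %x.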
+ Value *BoolVec; + Mask = InstCombiner::peekThroughBitcast(Mask); + if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && + BoolVec->getType()->isVectorTy() && + BoolVec->getType()->getScalarSizeInBits() == 1) { + assert(Mask->getType()->getPrimitiveSizeInBits() == + II.getType()->getPrimitiveSizeInBits() && + "Not expecting mask and operands with different sizes"); + + unsigned NumMaskElts = + cast(Mask->getType())->getNumElements(); + unsigned NumOperandElts = + cast(II.getType())->getNumElements(); + if (NumMaskElts == NumOperandElts) { + return SelectInst::Create(BoolVec, Op1, Op0); + } + + // If the mask has less elements than the operands, each mask bit maps to + // multiple elements of the operands. Bitcast back and forth. + if (NumMaskElts < NumOperandElts) { + Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType()); + Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType()); + Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); + return new BitCastInst(Sel, II.getType()); + } + } + + break; + } + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + if (Value *V = simplifyX86pshufb(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: + if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + case Intrinsic::x86_avx512_permvar_df_256: + case Intrinsic::x86_avx512_permvar_df_512: + case Intrinsic::x86_avx512_permvar_di_256: + case Intrinsic::x86_avx512_permvar_di_512: + case Intrinsic::x86_avx512_permvar_hi_128: + case Intrinsic::x86_avx512_permvar_hi_256: + case Intrinsic::x86_avx512_permvar_hi_512: + case Intrinsic::x86_avx512_permvar_qi_128: + case Intrinsic::x86_avx512_permvar_qi_256: + case Intrinsic::x86_avx512_permvar_qi_512: + case Intrinsic::x86_avx512_permvar_sf_512: + case Intrinsic::x86_avx512_permvar_si_512: + if (Value *V = simplifyX86vpermv(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx_maskload_ps: + case Intrinsic::x86_avx_maskload_pd: + case Intrinsic::x86_avx_maskload_ps_256: + case Intrinsic::x86_avx_maskload_pd_256: + case Intrinsic::x86_avx2_maskload_d: + case Intrinsic::x86_avx2_maskload_q: + case Intrinsic::x86_avx2_maskload_d_256: + case Intrinsic::x86_avx2_maskload_q_256: + if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { + return I; + } + break; + + case Intrinsic::x86_sse2_maskmov_dqu: + case Intrinsic::x86_avx_maskstore_ps: + case Intrinsic::x86_avx_maskstore_pd: + case Intrinsic::x86_avx_maskstore_ps_256: + case Intrinsic::x86_avx_maskstore_pd_256: + case Intrinsic::x86_avx2_maskstore_d: + case Intrinsic::x86_avx2_maskstore_q: + case Intrinsic::x86_avx2_maskstore_d_256: + case Intrinsic::x86_avx2_maskstore_q_256: + if (simplifyX86MaskedStore(II, IC)) { + return nullptr; + } + break; + + case Intrinsic::x86_addcarry_32: + case Intrinsic::x86_addcarry_64: + if (Value *V = simplifyX86addcarry(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + default: + break; + } + return None; +} + +Optional X86TTIImpl::simplifyDemandedUseBitsIntrinsic( + InstCombiner 
&IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const { + switch (II.getIntrinsicID()) { + default: + break; + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_avx2_pmovmskb: { + // MOVMSK copies the vector elements' sign bits to the low bits + // and zeros the high bits. + unsigned ArgWidth; + if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { + ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. + } else { + auto Arg = II.getArgOperand(0); + auto ArgType = cast(Arg->getType()); + ArgWidth = ArgType->getNumElements(); + } + + // If we don't need any of low bits then return zero, + // we know that DemandedMask is non-zero already. + APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); + Type *VTy = II.getType(); + if (DemandedElts.isNullValue()) { + return ConstantInt::getNullValue(VTy); + } + + // We know that the upper bits are set to zero. + Known.Zero.setBitsFrom(ArgWidth); + KnownBitsComputed = true; + break; + } + case Intrinsic::x86_sse42_crc32_64_64: + Known.Zero.setBitsFrom(32); + KnownBitsComputed = true; + break; + } + return None; +} + +Optional X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + simplifyAndSetOp) const { + unsigned VWidth = cast(II.getType())->getNumElements(); + switch (II.getIntrinsicID()) { + default: + break; + case Intrinsic::x86_xop_vfrcz_ss: + case Intrinsic::x86_xop_vfrcz_sd: + // The instructions for these intrinsics are speced to zero upper bits not + // pass them through like other scalar intrinsics. So we shouldn't just + // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. + // Instead we should return a zero vector. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return ConstantAggregateZero::get(II.getType()); + } + + // Only the lower element is used. + DemandedElts = 1; + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // Only the lower element is undefined. The high elements are zero. + UndefElts = UndefElts[0]; + break; + + // Unary scalar-as-vector operations that work column-wise. + case Intrinsic::x86_sse_rcp_ss: + case Intrinsic::x86_sse_rsqrt_ss: + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions + // checks). + break; + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0. The low element is a function of both + // operands. + case Intrinsic::x86_sse_min_ss: + case Intrinsic::x86_sse_max_ss: + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse2_min_sd: + case Intrinsic::x86_sse2_max_sd: + case Intrinsic::x86_sse2_cmp_sd: { + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1. + DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + + // Lower element is undefined if both lower elements are undefined. 
+ // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0]) + UndefElts.clearBit(0); + + break; + } + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element comes from operand 1. + case Intrinsic::x86_sse41_round_ss: + case Intrinsic::x86_sse41_round_sd: { + // Don't use the low element of operand 0. + APInt DemandedElts2 = DemandedElts; + DemandedElts2.clearBit(0); + simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1. + DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + + // Take the high undef elements from operand 0 and take the lower element + // from operand 1. + UndefElts.clearBit(0); + UndefElts |= UndefElts2[0]; + break; + } + + // Three input scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element is a function of all + // three inputs. + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_max_ss_round: + case Intrinsic::x86_avx512_mask_min_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + case Intrinsic::x86_avx512_mask_max_sd_round: + case Intrinsic::x86_avx512_mask_min_sd_round: + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1 and 2. + DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); + + // Lower element is undefined if all three lower elements are undefined. + // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0] || !UndefElts3[0]) + UndefElts.clearBit(0); + + break; + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: { + auto *Ty0 = II.getArgOperand(0)->getType(); + unsigned InnerVWidth = cast(Ty0)->getNumElements(); + assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); + + unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; + unsigned VWidthPerLane = VWidth / NumLanes; + unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; + + // Per lane, pack the elements of the first input and then the second. + // e.g. 
+ // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) + // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) + for (int OpNum = 0; OpNum != 2; ++OpNum) { + APInt OpDemandedElts(InnerVWidth, 0); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + unsigned LaneIdx = Lane * VWidthPerLane; + for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { + unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; + if (DemandedElts[Idx]) + OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); + } + } + + // Demand elements from the operand. + APInt OpUndefElts(InnerVWidth, 0); + simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); + + // Pack the operand's UNDEF elements, one lane at a time. + OpUndefElts = OpUndefElts.zext(VWidth); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); + LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); + LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); + UndefElts |= LaneElts; + } + } + break; + } + + // PSHUFB + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + // PERMILVAR + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: + // PERMV + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: { + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); + break; + } + + // SSE4A instructions leave the upper 64-bits of the 128-bit result + // in an undefined state. + case Intrinsic::x86_sse4a_extrq: + case Intrinsic::x86_sse4a_extrqi: + case Intrinsic::x86_sse4a_insertq: + case Intrinsic::x86_sse4a_insertqi: + UndefElts.setHighBits(VWidth / 2); + break; + } + return None; +} diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -22,6 +22,8 @@ namespace llvm { +class InstCombiner; + class X86TTIImpl : public BasicTTIImplBase { typedef BasicTTIImplBase BaseT; typedef TargetTransformInfo TTI; @@ -151,6 +153,18 @@ int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr); + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const; + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const; + unsigned getAtomicMemIntrinsicMaxElementSize() const; int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, diff --git a/llvm/lib/Transforms/InstCombine/CMakeLists.txt b/llvm/lib/Transforms/InstCombine/CMakeLists.txt --- a/llvm/lib/Transforms/InstCombine/CMakeLists.txt +++ b/llvm/lib/Transforms/InstCombine/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_TARGET_DEFINITIONS InstCombineTables.td) -tablegen(LLVM InstCombineTables.inc -gen-searchable-tables) -add_public_tablegen_target(InstCombineTableGen) - add_llvm_component_library(LLVMInstCombine InstructionCombining.cpp InstCombineAddSub.cpp diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp 
b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/AlignOf.h" #include "llvm/Support/Casting.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include #include @@ -860,7 +861,7 @@ return nullptr; } -Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) { +Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) { Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1); Constant *Op1C; if (!match(Op1, m_Constant(Op1C))) @@ -886,15 +887,15 @@ // zext(bool) + C -> bool ? C + 1 : C if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) - return SelectInst::Create(X, AddOne(Op1C), Op1); + return SelectInst::Create(X, InstCombiner::AddOne(Op1C), Op1); // sext(bool) + C -> bool ? C - 1 : C if (match(Op0, m_SExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) - return SelectInst::Create(X, SubOne(Op1C), Op1); + return SelectInst::Create(X, InstCombiner::SubOne(Op1C), Op1); // ~X + C --> (C-1) - X if (match(Op0, m_Not(m_Value(X)))) - return BinaryOperator::CreateSub(SubOne(Op1C), X); + return BinaryOperator::CreateSub(InstCombiner::SubOne(Op1C), X); const APInt *C; if (!match(Op1, m_APInt(C))) @@ -1021,7 +1022,7 @@ // Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1) // does not overflow. -Value *InstCombiner::SimplifyAddWithRemainder(BinaryOperator &I) { +Value *InstCombinerImpl::SimplifyAddWithRemainder(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); Value *X, *MulOpV; APInt C0, MulOpC; @@ -1097,9 +1098,9 @@ return nullptr; } -Instruction * -InstCombiner::canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( - BinaryOperator &I) { +Instruction *InstCombinerImpl:: + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( + BinaryOperator &I) { assert((I.getOpcode() == Instruction::Add || I.getOpcode() == Instruction::Or || I.getOpcode() == Instruction::Sub) && @@ -1198,7 +1199,7 @@ return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType()); } -Instruction *InstCombiner::visitAdd(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), SQ.getWithInstruction(&I))) @@ -1486,7 +1487,7 @@ : BinaryOperator::CreateFDivFMF(XY, Z, &I); } -Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) { if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) @@ -1600,8 +1601,8 @@ /// Optimize pointer differences into the same array into a size. Consider: /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. -Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, - Type *Ty, bool IsNUW) { +Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS, + Type *Ty, bool IsNUW) { // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this. 
bool Swapped = false; @@ -1692,7 +1693,7 @@ return Builder.CreateIntCast(Result, Ty, true); } -Instruction *InstCombiner::visitSub(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), SQ.getWithInstruction(&I))) @@ -1806,14 +1807,14 @@ Value *X; if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) // C - (zext bool) --> bool ? C - 1 : C - return SelectInst::Create(X, SubOne(C), C); + return SelectInst::Create(X, InstCombiner::SubOne(C), C); if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) // C - (sext bool) --> bool ? C + 1 : C - return SelectInst::Create(X, AddOne(C), C); + return SelectInst::Create(X, InstCombiner::AddOne(C), C); // C - ~X == X + (1+C) if (match(Op1, m_Not(m_Value(X)))) - return BinaryOperator::CreateAdd(X, AddOne(C)); + return BinaryOperator::CreateAdd(X, InstCombiner::AddOne(C)); // Try to fold constant sub into select arguments. if (SelectInst *SI = dyn_cast(Op1)) @@ -2094,11 +2095,11 @@ return nullptr; } -Instruction *InstCombiner::visitFNeg(UnaryOperator &I) { +Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) { Value *Op = I.getOperand(0); if (Value *V = SimplifyFNegInst(Op, I.getFastMathFlags(), - SQ.getWithInstruction(&I))) + getSimplifyQuery().getWithInstruction(&I))) return replaceInstUsesWith(I, V); if (Instruction *X = foldFNegIntoConstant(I)) @@ -2117,10 +2118,10 @@ return nullptr; } -Instruction *InstCombiner::visitFSub(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) { if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), - SQ.getWithInstruction(&I))) + getSimplifyQuery().getWithInstruction(&I))) return replaceInstUsesWith(I, V); if (Instruction *X = foldVectorBinop(I)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -13,10 +13,12 @@ #include "InstCombineInternal.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/Local.h" + using namespace llvm; using namespace PatternMatch; @@ -114,10 +116,9 @@ /// This handles expressions of the form ((val OP C1) & C2). Where /// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. -Instruction *InstCombiner::OptAndOp(BinaryOperator *Op, - ConstantInt *OpRHS, - ConstantInt *AndRHS, - BinaryOperator &TheAnd) { +Instruction *InstCombinerImpl::OptAndOp(BinaryOperator *Op, ConstantInt *OpRHS, + ConstantInt *AndRHS, + BinaryOperator &TheAnd) { Value *X = Op->getOperand(0); switch (Op->getOpcode()) { @@ -161,8 +162,9 @@ /// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise /// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates /// whether to treat V, Lo, and Hi as signed or not. -Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi, - bool isSigned, bool Inside) { +Value *InstCombinerImpl::insertRangeTest(Value *V, const APInt &Lo, + const APInt &Hi, bool isSigned, + bool Inside) { assert((isSigned ? 
Lo.slt(Hi) : Lo.ult(Hi)) && "Lo is not < Hi in range emission code!"); @@ -437,11 +439,10 @@ /// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros /// and the right hand side is of type BMask_Mixed. For example, /// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8). -static Value * foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( - ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, - Value *A, Value *B, Value *C, Value *D, Value *E, - ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, - llvm::InstCombiner::BuilderTy &Builder) { +static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( + ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C, + Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, + InstCombiner::BuilderTy &Builder) { // We are given the canonical form: // (icmp ne (A & B), 0) & (icmp eq (A & D), E). // where D & E == E. @@ -568,11 +569,9 @@ /// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side /// aren't of the common mask pattern type. static Value *foldLogOpOfMaskedICmpsAsymmetric( - ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, - Value *A, Value *B, Value *C, Value *D, Value *E, - ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, - unsigned LHSMask, unsigned RHSMask, - llvm::InstCombiner::BuilderTy &Builder) { + ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C, + Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, + unsigned LHSMask, unsigned RHSMask, InstCombiner::BuilderTy &Builder) { assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) && "Expected equality predicates for masked type of icmps."); // Handle Mask_NotAllZeros-BMask_Mixed cases. @@ -603,7 +602,7 @@ /// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) /// into a single (icmp(A & X) ==/!= Y). static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, - llvm::InstCombiner::BuilderTy &Builder) { + InstCombiner::BuilderTy &Builder) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); Optional> MaskPair = @@ -748,8 +747,8 @@ /// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n /// If \p Inverted is true then the check is for the inverted range, e.g. /// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n -Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, - bool Inverted) { +Value *InstCombinerImpl::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, + bool Inverted) { // Check the lower range comparison, e.g. x >= 0 // InstCombine already ensured that if there is a constant it's on the RHS. ConstantInt *RangeStart = dyn_cast(Cmp0->getOperand(1)); @@ -856,8 +855,9 @@ // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) -Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS, - BinaryOperator &Logic) { +Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, + ICmpInst *RHS, + BinaryOperator &Logic) { bool JoinedByAnd = Logic.getOpcode() == Instruction::And; assert((JoinedByAnd || Logic.getOpcode() == Instruction::Or) && "Wrong opcode"); @@ -1184,8 +1184,8 @@ } /// Fold (icmp)&(icmp) if possible. 
-Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, - BinaryOperator &And) { +Value *InstCombinerImpl::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, + BinaryOperator &And) { const SimplifyQuery Q = SQ.getWithInstruction(&And); // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) @@ -1404,7 +1404,8 @@ return nullptr; } -Value *InstCombiner::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd) { +Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, + bool IsAnd) { Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1); Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1); FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); @@ -1514,8 +1515,8 @@ Value *A, *B; if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) && match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) && - !isFreeToInvert(A, A->hasOneUse()) && - !isFreeToInvert(B, B->hasOneUse())) { + !InstCombiner::isFreeToInvert(A, A->hasOneUse()) && + !InstCombiner::isFreeToInvert(B, B->hasOneUse())) { Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan"); return BinaryOperator::CreateNot(AndOr); } @@ -1523,7 +1524,7 @@ return nullptr; } -bool InstCombiner::shouldOptimizeCast(CastInst *CI) { +bool InstCombinerImpl::shouldOptimizeCast(CastInst *CI) { Value *CastSrc = CI->getOperand(0); // Noop casts and casts of constants should be eliminated trivially. @@ -1579,7 +1580,7 @@ } /// Fold {and,or,xor} (cast X), Y. -Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) { +Instruction *InstCombinerImpl::foldCastedBitwiseLogic(BinaryOperator &I) { auto LogicOpc = I.getOpcode(); assert(I.isBitwiseLogicOp() && "Unexpected opcode for bitwise logic folding"); @@ -1725,7 +1726,7 @@ /// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and /// a common zext operand: and (binop (zext X), C), (zext X). -Instruction *InstCombiner::narrowMaskedBinOp(BinaryOperator &And) { +Instruction *InstCombinerImpl::narrowMaskedBinOp(BinaryOperator &And) { // This transform could also apply to {or, and, xor}, but there are better // folds for those cases, so we don't expect those patterns here. AShr is not // handled because it should always be transformed to LShr in this sequence. @@ -1767,7 +1768,7 @@ // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. -Instruction *InstCombiner::visitAnd(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -2033,7 +2034,7 @@ return nullptr; } -Instruction *InstCombiner::matchBSwap(BinaryOperator &Or) { +Instruction *InstCombinerImpl::matchBSwap(BinaryOperator &Or) { assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'"); Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1); @@ -2216,7 +2217,7 @@ /// We have an expression of the form (A & C) | (B & D). If A is a scalar or /// vector composed of all-zeros or all-ones values and is the bitwise 'not' of /// B, it can be used as the condition operand of a select instruction. -Value *InstCombiner::getSelectCondition(Value *A, Value *B) { +Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B) { // Step 1: We may have peeked through bitcasts in the caller. 
// Exit immediately if we don't have (vector) integer types. Type *Ty = A->getType(); @@ -2273,8 +2274,8 @@ /// We have an expression of the form (A & C) | (B & D). Try to simplify this /// to "A' ? C : D", where A' is a boolean or vector of booleans. -Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B, - Value *D) { +Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B, + Value *D) { // The potential condition of the select may be bitcasted. In that case, look // through its bitcast and the corresponding bitcast of the 'not' condition. Type *OrigType = A->getType(); @@ -2294,8 +2295,8 @@ } /// Fold (icmp)|(icmp) if possible. -Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, - BinaryOperator &Or) { +Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, + BinaryOperator &Or) { const SimplifyQuery Q = SQ.getWithInstruction(&Or); // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) @@ -2560,7 +2561,7 @@ // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. -Instruction *InstCombiner::visitOr(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -2929,8 +2930,8 @@ return nullptr; } -Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, - BinaryOperator &I) { +Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, + BinaryOperator &I) { assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS && I.getOperand(1) == RHS && "Should be 'xor' with these operands"); @@ -3088,9 +3089,9 @@ return nullptr; // We only want to do the transform if it is free to do. - if (isFreeToInvert(X, X->hasOneUse())) { + if (InstCombiner::isFreeToInvert(X, X->hasOneUse())) { // Ok, good. - } else if (isFreeToInvert(Y, Y->hasOneUse())) { + } else if (InstCombiner::isFreeToInvert(Y, Y->hasOneUse())) { std::swap(X, Y); } else return nullptr; @@ -3102,7 +3103,7 @@ // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. -Instruction *InstCombiner::visitXor(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -9,8 +9,10 @@ // This file implements the visit functions for atomic rmw instructions. 
// //===----------------------------------------------------------------------===// + #include "InstCombineInternal.h" #include "llvm/IR/Instructions.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" using namespace llvm; @@ -30,7 +32,7 @@ default: return false; }; - + auto C = dyn_cast(RMWI.getValOperand()); if(!C) return false; @@ -93,11 +95,11 @@ } } -Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { +Instruction *InstCombinerImpl::visitAtomicRMWInst(AtomicRMWInst &RMWI) { // Volatile RMWs perform a load and a store, we cannot replace this by just a // load or just a store. We chose not to canonicalize out of general paranoia - // about user expectations around volatile. + // about user expectations around volatile. if (RMWI.isVolatile()) return nullptr; @@ -115,7 +117,7 @@ "AtomicRMWs don't make sense with Unordered or NotAtomic"); // Any atomicrmw xchg with no uses can be converted to a atomic store if the - // ordering is compatible. + // ordering is compatible. if (RMWI.getOperation() == AtomicRMWInst::Xchg && RMWI.use_empty()) { if (Ordering != AtomicOrdering::Release && @@ -127,14 +129,14 @@ SI->setAlignment(DL.getABITypeAlign(RMWI.getType())); return eraseInstFromFunction(RMWI); } - + if (!isIdempotentRMW(RMWI)) return nullptr; // We chose to canonicalize all idempotent operations to an single // operation code and constant. This makes it easier for the rest of the // optimizer to match easily. The choices of or w/0 and fadd w/-0.0 are - // arbitrary. + // arbitrary. if (RMWI.getType()->isIntegerTy() && RMWI.getOperation() != AtomicRMWInst::Or) { RMWI.setOperation(AtomicRMWInst::Or); @@ -149,7 +151,7 @@ if (Ordering != AtomicOrdering::Acquire && Ordering != AtomicOrdering::Monotonic) return nullptr; - + LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand(), "", false, DL.getABITypeAlign(RMWI.getType()), Ordering, RMWI.getSyncScopeID()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" @@ -47,9 +48,6 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/IntrinsicsHexagon.h" -#include "llvm/IR/IntrinsicsNVPTX.h" -#include "llvm/IR/IntrinsicsPowerPC.h" -#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" @@ -68,6 +66,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" #include @@ -100,24 +99,7 @@ return Ty; } -/// Return a constant boolean vector that has true elements in all positions -/// where the input constant data vector has an element with the sign bit set. 
-static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { - SmallVector BoolVec; - IntegerType *BoolTy = Type::getInt1Ty(V->getContext()); - for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) { - Constant *Elt = V->getElementAsConstant(I); - assert((isa(Elt) || isa(Elt)) && - "Unexpected constant data vector element type"); - bool Sign = V->getElementType()->isIntegerTy() - ? cast(Elt)->isNegative() - : cast(Elt)->isNegative(); - BoolVec.push_back(ConstantInt::get(BoolTy, Sign)); - } - return ConstantVector::get(BoolVec); -} - -Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { +Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { Align DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT); MaybeAlign CopyDstAlign = MI->getDestAlign(); if (!CopyDstAlign || *CopyDstAlign < DstAlign) { @@ -232,7 +214,7 @@ return MI; } -Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { +Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) { const Align KnownAlignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); MaybeAlign MemSetAlign = MI->getDestAlign(); @@ -292,820 +274,9 @@ return nullptr; } -static Value *simplifyX86immShift(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - bool LogicalShift = false; - bool ShiftLeft = false; - bool IsImm = false; - - switch (II.getIntrinsicID()) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx512_psrai_q_128: - case Intrinsic::x86_avx512_psrai_q_256: - case Intrinsic::x86_avx512_psrai_d_512: - case Intrinsic::x86_avx512_psrai_q_512: - case Intrinsic::x86_avx512_psrai_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_avx2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx512_psra_q_128: - case Intrinsic::x86_avx512_psra_q_256: - case Intrinsic::x86_avx512_psra_d_512: - case Intrinsic::x86_avx512_psra_q_512: - case Intrinsic::x86_avx512_psra_w_512: - LogicalShift = false; - ShiftLeft = false; - break; - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx512_psrli_d_512: - case Intrinsic::x86_avx512_psrli_q_512: - case Intrinsic::x86_avx512_psrli_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx512_psrl_d_512: - case Intrinsic::x86_avx512_psrl_q_512: - case Intrinsic::x86_avx512_psrl_w_512: - LogicalShift = true; - ShiftLeft = false; - break; - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx512_pslli_d_512: - case Intrinsic::x86_avx512_pslli_q_512: - case Intrinsic::x86_avx512_pslli_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case 
Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx512_psll_d_512: - case Intrinsic::x86_avx512_psll_q_512: - case Intrinsic::x86_avx512_psll_w_512: - LogicalShift = true; - ShiftLeft = true; - break; - } - assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); - - auto Vec = II.getArgOperand(0); - auto Amt = II.getArgOperand(1); - auto VT = cast(Vec->getType()); - auto SVT = VT->getElementType(); - auto AmtVT = Amt->getType(); - unsigned VWidth = VT->getNumElements(); - unsigned BitWidth = SVT->getPrimitiveSizeInBits(); - - // If the shift amount is guaranteed to be in-range we can replace it with a - // generic shift. If its guaranteed to be out of range, logical shifts combine to - // zero and arithmetic shifts are clamped to (BitWidth - 1). - if (IsImm) { - assert(AmtVT ->isIntegerTy(32) && - "Unexpected shift-by-immediate type"); - KnownBits KnownAmtBits = - llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); - if (KnownAmtBits.getMaxValue().ult(BitWidth)) { - Amt = Builder.CreateZExtOrTrunc(Amt, SVT); - Amt = Builder.CreateVectorSplat(VWidth, Amt); - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - if (KnownAmtBits.getMinValue().uge(BitWidth)) { - if (LogicalShift) - return ConstantAggregateZero::get(VT); - Amt = ConstantInt::get(SVT, BitWidth - 1); - return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); - } - } else { - // Ensure the first element has an in-range value and the rest of the - // elements in the bottom 64 bits are zero. - assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && - cast(AmtVT)->getElementType() == SVT && - "Unexpected shift-by-scalar type"); - unsigned NumAmtElts = cast(AmtVT)->getNumElements(); - APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); - APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); - KnownBits KnownLowerBits = llvm::computeKnownBits( - Amt, DemandedLower, II.getModule()->getDataLayout()); - KnownBits KnownUpperBits = llvm::computeKnownBits( - Amt, DemandedUpper, II.getModule()->getDataLayout()); - if (KnownLowerBits.getMaxValue().ult(BitWidth) && - (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) { - SmallVector ZeroSplat(VWidth, 0); - Amt = Builder.CreateShuffleVector(Amt, Amt, ZeroSplat); - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - } - - // Simplify if count is constant vector. - auto CDV = dyn_cast(Amt); - if (!CDV) - return nullptr; - - // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector - // operand to compute the shift amount. - assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && - cast(AmtVT)->getElementType() == SVT && - "Unexpected shift-by-scalar type"); - - // Concatenate the sub-elements to create the 64-bit value. - APInt Count(64, 0); - for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { - unsigned SubEltIdx = (NumSubElts - 1) - i; - auto SubElt = cast(CDV->getElementAsConstant(SubEltIdx)); - Count <<= BitWidth; - Count |= SubElt->getValue().zextOrTrunc(64); - } - - // If shift-by-zero then just return the original value. - if (Count.isNullValue()) - return Vec; - - // Handle cases when Shift >= BitWidth. - if (Count.uge(BitWidth)) { - // If LogicalShift - just return zero. 
- if (LogicalShift) - return ConstantAggregateZero::get(VT); - - // If ArithmeticShift - clamp Shift to (BitWidth - 1). - Count = APInt(64, BitWidth - 1); - } - - // Get a constant vector of the same type as the first operand. - auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); - auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); - - if (ShiftLeft) - return Builder.CreateShl(Vec, ShiftVec); - - if (LogicalShift) - return Builder.CreateLShr(Vec, ShiftVec); - - return Builder.CreateAShr(Vec, ShiftVec); -} - -// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. -// Unlike the generic IR shifts, the intrinsics have defined behaviour for out -// of range shift amounts (logical - set to zero, arithmetic - splat sign bit). -static Value *simplifyX86varShift(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - bool LogicalShift = false; - bool ShiftLeft = false; - - switch (II.getIntrinsicID()) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - case Intrinsic::x86_avx512_psrav_q_128: - case Intrinsic::x86_avx512_psrav_q_256: - case Intrinsic::x86_avx512_psrav_d_512: - case Intrinsic::x86_avx512_psrav_q_512: - case Intrinsic::x86_avx512_psrav_w_128: - case Intrinsic::x86_avx512_psrav_w_256: - case Intrinsic::x86_avx512_psrav_w_512: - LogicalShift = false; - ShiftLeft = false; - break; - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx512_psrlv_d_512: - case Intrinsic::x86_avx512_psrlv_q_512: - case Intrinsic::x86_avx512_psrlv_w_128: - case Intrinsic::x86_avx512_psrlv_w_256: - case Intrinsic::x86_avx512_psrlv_w_512: - LogicalShift = true; - ShiftLeft = false; - break; - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx512_psllv_d_512: - case Intrinsic::x86_avx512_psllv_q_512: - case Intrinsic::x86_avx512_psllv_w_128: - case Intrinsic::x86_avx512_psllv_w_256: - case Intrinsic::x86_avx512_psllv_w_512: - LogicalShift = true; - ShiftLeft = true; - break; - } - assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); - - auto Vec = II.getArgOperand(0); - auto Amt = II.getArgOperand(1); - auto VT = cast(II.getType()); - auto SVT = VT->getElementType(); - int NumElts = VT->getNumElements(); - int BitWidth = SVT->getIntegerBitWidth(); - - // If the shift amount is guaranteed to be in-range we can replace it with a - // generic shift. - APInt UpperBits = - APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); - if (llvm::MaskedValueIsZero(Amt, UpperBits, - II.getModule()->getDataLayout())) { - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - - // Simplify if all shift amounts are constant/undef. - auto *CShift = dyn_cast(Amt); - if (!CShift) - return nullptr; - - // Collect each element's shift amount. - // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. - bool AnyOutOfRange = false; - SmallVector ShiftAmts; - for (int I = 0; I < NumElts; ++I) { - auto *CElt = CShift->getAggregateElement(I); - if (CElt && isa(CElt)) { - ShiftAmts.push_back(-1); - continue; - } - - auto *COp = dyn_cast_or_null(CElt); - if (!COp) - return nullptr; - - // Handle out of range shifts. 
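
(Aside, not part of this patch: the two shift helpers in this hunk assume the x86 rule that an out-of-range shift amount turns a logical shift into zero and acts like a shift by BitWidth - 1 for an arithmetic shift. A minimal scalar sketch of that rule in plain C++; the function names are illustrative only, and the arithmetic case relies on signed right shift being arithmetic, which C++20 guarantees.)

#include <cstdint>

// Reference semantics assumed by the folds above, shown for 32-bit elements.
uint32_t x86Psrl32(uint32_t V, uint64_t Amt) {
  return Amt >= 32 ? 0u : V >> Amt;     // logical right: out of range -> 0
}
uint32_t x86Psll32(uint32_t V, uint64_t Amt) {
  return Amt >= 32 ? 0u : V << Amt;     // logical left: out of range -> 0
}
int32_t x86Psra32(int32_t V, uint64_t Amt) {
  return V >> (Amt >= 32 ? 31 : Amt);   // arithmetic: clamp to 31 (sign splat)
}
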
- // If LogicalShift - set to BitWidth (special case). - // If ArithmeticShift - set to (BitWidth - 1) (sign splat). - APInt ShiftVal = COp->getValue(); - if (ShiftVal.uge(BitWidth)) { - AnyOutOfRange = LogicalShift; - ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); - continue; - } - - ShiftAmts.push_back((int)ShiftVal.getZExtValue()); - } - - // If all elements out of range or UNDEF, return vector of zeros/undefs. - // ArithmeticShift should only hit this if they are all UNDEF. - auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; - if (llvm::all_of(ShiftAmts, OutOfRange)) { - SmallVector ConstantVec; - for (int Idx : ShiftAmts) { - if (Idx < 0) { - ConstantVec.push_back(UndefValue::get(SVT)); - } else { - assert(LogicalShift && "Logical shift expected"); - ConstantVec.push_back(ConstantInt::getNullValue(SVT)); - } - } - return ConstantVector::get(ConstantVec); - } - - // We can't handle only some out of range values with generic logical shifts. - if (AnyOutOfRange) - return nullptr; - - // Build the shift amount constant vector. - SmallVector ShiftVecAmts; - for (int Idx : ShiftAmts) { - if (Idx < 0) - ShiftVecAmts.push_back(UndefValue::get(SVT)); - else - ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); - } - auto ShiftVec = ConstantVector::get(ShiftVecAmts); - - if (ShiftLeft) - return Builder.CreateShl(Vec, ShiftVec); - - if (LogicalShift) - return Builder.CreateLShr(Vec, ShiftVec); - - return Builder.CreateAShr(Vec, ShiftVec); -} - -static Value *simplifyX86pack(IntrinsicInst &II, - InstCombiner::BuilderTy &Builder, bool IsSigned) { - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - Type *ResTy = II.getType(); - - // Fast all undef handling. - if (isa(Arg0) && isa(Arg1)) - return UndefValue::get(ResTy); - - auto *ArgTy = cast(Arg0->getType()); - unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; - unsigned NumSrcElts = ArgTy->getNumElements(); - assert(cast(ResTy)->getNumElements() == (2 * NumSrcElts) && - "Unexpected packing types"); - - unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; - unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); - unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); - assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && - "Unexpected packing types"); - - // Constant folding. - if (!isa(Arg0) || !isa(Arg1)) - return nullptr; - - // Clamp Values - signed/unsigned both use signed clamp values, but they - // differ on the min/max values. - APInt MinValue, MaxValue; - if (IsSigned) { - // PACKSS: Truncate signed value with signed saturation. - // Source values less than dst minint are saturated to minint. - // Source values greater than dst maxint are saturated to maxint. - MinValue = - APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); - MaxValue = - APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); - } else { - // PACKUS: Truncate signed value with unsigned saturation. - // Source values less than zero are saturated to zero. - // Source values greater than dst maxuint are saturated to maxuint. 
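
(Aside, not part of this patch: the clamp constants being set up here implement ordinary signed/unsigned saturation before truncation. A scalar C++ model of the 16-bit to 8-bit case, with hypothetical function names, for comparison.)

#include <algorithm>
#include <cstdint>

// PACKSSWB element: signed 16-bit -> signed 8-bit with signed saturation.
int8_t packss16to8(int16_t X) {
  return static_cast<int8_t>(std::min<int16_t>(std::max<int16_t>(X, -128), 127));
}

// PACKUSWB element: signed 16-bit -> unsigned 8-bit with unsigned saturation.
uint8_t packus16to8(int16_t X) {
  return static_cast<uint8_t>(std::min<int16_t>(std::max<int16_t>(X, 0), 255));
}
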
- MinValue = APInt::getNullValue(SrcScalarSizeInBits); - MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); - } - - auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); - auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); - Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); - Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); - Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); - Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); - - // Shuffle clamped args together at the lane level. - SmallVector PackMask; - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) - PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); - for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) - PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); - } - auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); - - // Truncate to dst size. - return Builder.CreateTrunc(Shuffle, ResTy); -} - -static Value *simplifyX86movmsk(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Value *Arg = II.getArgOperand(0); - Type *ResTy = II.getType(); - - // movmsk(undef) -> zero as we must ensure the upper bits are zero. - if (isa(Arg)) - return Constant::getNullValue(ResTy); - - auto *ArgTy = dyn_cast(Arg->getType()); - // We can't easily peek through x86_mmx types. - if (!ArgTy) - return nullptr; - - // Expand MOVMSK to compare/bitcast/zext: - // e.g. PMOVMSKB(v16i8 x): - // %cmp = icmp slt <16 x i8> %x, zeroinitializer - // %int = bitcast <16 x i1> %cmp to i16 - // %res = zext i16 %int to i32 - unsigned NumElts = ArgTy->getNumElements(); - Type *IntegerVecTy = VectorType::getInteger(ArgTy); - Type *IntegerTy = Builder.getIntNTy(NumElts); - - Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); - Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); - Res = Builder.CreateBitCast(Res, IntegerTy); - Res = Builder.CreateZExtOrTrunc(Res, ResTy); - return Res; -} - -static Value *simplifyX86addcarry(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Value *CarryIn = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - Value *Op2 = II.getArgOperand(2); - Type *RetTy = II.getType(); - Type *OpTy = Op1->getType(); - assert(RetTy->getStructElementType(0)->isIntegerTy(8) && - RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && - "Unexpected types for x86 addcarry"); - - // If carry-in is zero, this is just an unsigned add with overflow. - if (match(CarryIn, m_ZeroInt())) { - Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, - { Op1, Op2 }); - // The types have to be adjusted to match the x86 call types. 
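
(Aside, not part of this patch: the fold above treats addcarry with a known-zero carry-in as a plain unsigned add plus an overflow flag, which is exactly what uadd.with.overflow computes. A scalar C++ equivalent with hypothetical names.)

#include <cstdint>
#include <utility>

// addcarry with carry-in 0: returns {carry-out, A + B}. The carry-out is just
// the unsigned-overflow bit of the addition.
std::pair<uint8_t, uint64_t> addcarryZeroIn(uint64_t A, uint64_t B) {
  uint64_t Sum = A + B;                 // wraps modulo 2^64
  uint8_t CarryOut = Sum < A ? 1 : 0;   // overflow iff the sum wrapped
  return {CarryOut, Sum};
}
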
- Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); - Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), - Builder.getInt8Ty()); - Value *Res = UndefValue::get(RetTy); - Res = Builder.CreateInsertValue(Res, UAddOV, 0); - return Builder.CreateInsertValue(Res, UAddResult, 1); - } - - return nullptr; -} - -static Value *simplifyX86insertps(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - auto *CInt = dyn_cast(II.getArgOperand(2)); - if (!CInt) - return nullptr; - - VectorType *VecTy = cast(II.getType()); - assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); - - // The immediate permute control byte looks like this: - // [3:0] - zero mask for each 32-bit lane - // [5:4] - select one 32-bit destination lane - // [7:6] - select one 32-bit source lane - - uint8_t Imm = CInt->getZExtValue(); - uint8_t ZMask = Imm & 0xf; - uint8_t DestLane = (Imm >> 4) & 0x3; - uint8_t SourceLane = (Imm >> 6) & 0x3; - - ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); - - // If all zero mask bits are set, this was just a weird way to - // generate a zero vector. - if (ZMask == 0xf) - return ZeroVector; - - // Initialize by passing all of the first source bits through. - int ShuffleMask[4] = {0, 1, 2, 3}; - - // We may replace the second operand with the zero vector. - Value *V1 = II.getArgOperand(1); - - if (ZMask) { - // If the zero mask is being used with a single input or the zero mask - // overrides the destination lane, this is a shuffle with the zero vector. - if ((II.getArgOperand(0) == II.getArgOperand(1)) || - (ZMask & (1 << DestLane))) { - V1 = ZeroVector; - // We may still move 32-bits of the first source vector from one lane - // to another. - ShuffleMask[DestLane] = SourceLane; - // The zero mask may override the previous insert operation. - for (unsigned i = 0; i < 4; ++i) - if ((ZMask >> i) & 0x1) - ShuffleMask[i] = i + 4; - } else { - // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? - return nullptr; - } - } else { - // Replace the selected destination lane with the selected source lane. - ShuffleMask[DestLane] = SourceLane + 4; - } - - return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); -} - -/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding -/// or conversion to a shuffle vector. -static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, - ConstantInt *CILength, ConstantInt *CIIndex, - InstCombiner::BuilderTy &Builder) { - auto LowConstantHighUndef = [&](uint64_t Val) { - Type *IntTy64 = Type::getInt64Ty(II.getContext()); - Constant *Args[] = {ConstantInt::get(IntTy64, Val), - UndefValue::get(IntTy64)}; - return ConstantVector::get(Args); - }; - - // See if we're dealing with constant values. - Constant *C0 = dyn_cast(Op0); - ConstantInt *CI0 = - C0 ? dyn_cast_or_null(C0->getAggregateElement((unsigned)0)) - : nullptr; - - // Attempt to constant fold. - if (CILength && CIIndex) { - // From AMD documentation: "The bit index and field length are each six - // bits in length other bits of the field are ignored." - APInt APIndex = CIIndex->getValue().zextOrTrunc(6); - APInt APLength = CILength->getValue().zextOrTrunc(6); - - unsigned Index = APIndex.getZExtValue(); - - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = APLength == 0 ? 
64 : APLength.getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - unsigned End = Index + Length; - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if (End > 64) - return UndefValue::get(II.getType()); - - // If we are inserting whole bytes, we can convert this to a shuffle. - // Lowering can recognize EXTRQI shuffle masks. - if ((Length % 8) == 0 && (Index % 8) == 0) { - // Convert bit indices to byte indices. - Length /= 8; - Index /= 8; - - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - auto *ShufTy = FixedVectorType::get(IntTy8, 16); - - SmallVector ShuffleMask; - for (int i = 0; i != (int)Length; ++i) - ShuffleMask.push_back(i + Index); - for (int i = Length; i != 8; ++i) - ShuffleMask.push_back(i + 16); - for (int i = 8; i != 16; ++i) - ShuffleMask.push_back(-1); - - Value *SV = Builder.CreateShuffleVector( - Builder.CreateBitCast(Op0, ShufTy), - ConstantAggregateZero::get(ShufTy), ShuffleMask); - return Builder.CreateBitCast(SV, II.getType()); - } - - // Constant Fold - shift Index'th bit to lowest position and mask off - // Length bits. - if (CI0) { - APInt Elt = CI0->getValue(); - Elt.lshrInPlace(Index); - Elt = Elt.zextOrTrunc(Length); - return LowConstantHighUndef(Elt.getZExtValue()); - } - - // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. - if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { - Value *Args[] = {Op0, CILength, CIIndex}; - Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); - return Builder.CreateCall(F, Args); - } - } - - // Constant Fold - extraction from zero is always {zero, undef}. - if (CI0 && CI0->isZero()) - return LowConstantHighUndef(0); - - return nullptr; -} - -/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant -/// folding or conversion to a shuffle vector. -static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, - APInt APLength, APInt APIndex, - InstCombiner::BuilderTy &Builder) { - // From AMD documentation: "The bit index and field length are each six bits - // in length other bits of the field are ignored." - APIndex = APIndex.zextOrTrunc(6); - APLength = APLength.zextOrTrunc(6); - - // Attempt to constant fold. - unsigned Index = APIndex.getZExtValue(); - - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - unsigned End = Index + Length; - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if (End > 64) - return UndefValue::get(II.getType()); - - // If we are inserting whole bytes, we can convert this to a shuffle. - // Lowering can recognize INSERTQI shuffle masks. - if ((Length % 8) == 0 && (Index % 8) == 0) { - // Convert bit indices to byte indices. 
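
(Aside, not part of this patch: the EXTRQ/INSERTQ folds in this hunk follow the field rules quoted from the AMD manual: only six bits of index and length are used, a length of zero means 64, and index + length > 64 is undefined. A scalar C++ reference of the extract case, names illustrative.)

#include <cstdint>
#include <optional>

// Returns the extracted field, or nullopt for the undefined out-of-range case.
std::optional<uint64_t> sse4aExtrq(uint64_t Src, unsigned Index, unsigned Length) {
  Index &= 0x3f;                        // only six bits of each field are used
  Length &= 0x3f;
  if (Length == 0)
    Length = 64;                        // length 0 encodes a full 64-bit field
  if (Index + Length > 64)
    return std::nullopt;                // result is undefined per the manual
  uint64_t Field = Src >> Index;
  if (Length < 64)
    Field &= (1ULL << Length) - 1;      // keep only Length bits
  return Field;
}
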
- Length /= 8; - Index /= 8; - - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - auto *ShufTy = FixedVectorType::get(IntTy8, 16); - - SmallVector ShuffleMask; - for (int i = 0; i != (int)Index; ++i) - ShuffleMask.push_back(i); - for (int i = 0; i != (int)Length; ++i) - ShuffleMask.push_back(i + 16); - for (int i = Index + Length; i != 8; ++i) - ShuffleMask.push_back(i); - for (int i = 8; i != 16; ++i) - ShuffleMask.push_back(-1); - - Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), - Builder.CreateBitCast(Op1, ShufTy), - ShuffleMask); - return Builder.CreateBitCast(SV, II.getType()); - } - - // See if we're dealing with constant values. - Constant *C0 = dyn_cast(Op0); - Constant *C1 = dyn_cast(Op1); - ConstantInt *CI00 = - C0 ? dyn_cast_or_null(C0->getAggregateElement((unsigned)0)) - : nullptr; - ConstantInt *CI10 = - C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)0)) - : nullptr; - - // Constant Fold - insert bottom Length bits starting at the Index'th bit. - if (CI00 && CI10) { - APInt V00 = CI00->getValue(); - APInt V10 = CI10->getValue(); - APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); - V00 = V00 & ~Mask; - V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); - APInt Val = V00 | V10; - Type *IntTy64 = Type::getInt64Ty(II.getContext()); - Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), - UndefValue::get(IntTy64)}; - return ConstantVector::get(Args); - } - - // If we were an INSERTQ call, we'll save demanded elements if we convert to - // INSERTQI. - if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - Constant *CILength = ConstantInt::get(IntTy8, Length, false); - Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); - - Value *Args[] = {Op0, Op1, CILength, CIIndex}; - Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); - return Builder.CreateCall(F, Args); - } - - return nullptr; -} - -/// Attempt to convert pshufb* to shufflevector if the mask is constant. -static Value *simplifyX86pshufb(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Constant *V = dyn_cast(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast(II.getType()); - unsigned NumElts = VecTy->getNumElements(); - assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && - "Unexpected number of elements in shuffle mask!"); - - // Construct a shuffle mask from constant integers or UNDEFs. - int Indexes[64]; - - // Each byte in the shuffle control mask forms an index to permute the - // corresponding byte in the destination operand. - for (unsigned I = 0; I < NumElts; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa(COp) && !isa(COp))) - return nullptr; - - if (isa(COp)) { - Indexes[I] = -1; - continue; - } - - int8_t Index = cast(COp)->getValue().getZExtValue(); - - // If the most significant bit (bit[7]) of each byte of the shuffle - // control mask is set, then zero is written in the result byte. - // The zero vector is in the right-hand side of the resulting - // shufflevector. - - // The value of each index for the high 128-bit lane is the least - // significant 4 bits of the respective shuffle control byte. - Index = ((Index < 0) ? 
NumElts : Index & 0x0F) + (I & 0xF0); - Indexes[I] = Index; - } - - auto V1 = II.getArgOperand(0); - auto V2 = Constant::getNullValue(VecTy); - return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); -} - -/// Attempt to convert vpermilvar* to shufflevector if the mask is constant. -static Value *simplifyX86vpermilvar(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Constant *V = dyn_cast(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast(II.getType()); - unsigned NumElts = VecTy->getNumElements(); - bool IsPD = VecTy->getScalarType()->isDoubleTy(); - unsigned NumLaneElts = IsPD ? 2 : 4; - assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); - - // Construct a shuffle mask from constant integers or UNDEFs. - int Indexes[16]; - - // The intrinsics only read one or two bits, clear the rest. - for (unsigned I = 0; I < NumElts; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa(COp) && !isa(COp))) - return nullptr; - - if (isa(COp)) { - Indexes[I] = -1; - continue; - } - - APInt Index = cast(COp)->getValue(); - Index = Index.zextOrTrunc(32).getLoBits(2); - - // The PD variants uses bit 1 to select per-lane element index, so - // shift down to convert to generic shuffle mask index. - if (IsPD) - Index.lshrInPlace(1); - - // The _256 variants are a bit trickier since the mask bits always index - // into the corresponding 128 half. In order to convert to a generic - // shuffle, we have to make that explicit. - Index += APInt(32, (I / NumLaneElts) * NumLaneElts); - - Indexes[I] = Index.getZExtValue(); - } - - auto V1 = II.getArgOperand(0); - auto V2 = UndefValue::get(V1->getType()); - return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); -} - -/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. -static Value *simplifyX86vpermv(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - auto *V = dyn_cast(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast(II.getType()); - unsigned Size = VecTy->getNumElements(); - assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && - "Unexpected shuffle mask size"); - - // Construct a shuffle mask from constant integers or UNDEFs. 
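
(Aside, not part of this patch: simplifyX86vpermv relies on VPERMD/VPERMPS reading only the low bits of each index, i.e. indices are taken modulo the element count. A scalar C++ model for the 256-bit form, names illustrative.)

#include <array>
#include <cstdint>

// VPERMD: each result element selects a source element by index modulo the
// vector width (8 elements in the 256-bit form).
std::array<uint32_t, 8> vpermd(const std::array<uint32_t, 8> &Src,
                               const std::array<uint32_t, 8> &Idx) {
  std::array<uint32_t, 8> Result{};
  for (unsigned I = 0; I != 8; ++I)
    Result[I] = Src[Idx[I] & 7];        // only the low 3 bits of the index matter
  return Result;
}
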
- int Indexes[64]; - - for (unsigned I = 0; I < Size; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa(COp) && !isa(COp))) - return nullptr; - - if (isa(COp)) { - Indexes[I] = -1; - continue; - } - - uint32_t Index = cast(COp)->getZExtValue(); - Index &= Size - 1; - Indexes[I] = Index; - } - - auto V1 = II.getArgOperand(0); - auto V2 = UndefValue::get(VecTy); - return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, Size)); -} - // TODO, Obvious Missing Transforms: // * Narrow width by halfs excluding zero/undef lanes -Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) { +Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) { Value *LoadPtr = II.getArgOperand(0); const Align Alignment = cast(II.getArgOperand(1))->getAlignValue(); @@ -1132,7 +303,7 @@ // TODO, Obvious Missing Transforms: // * Single constant active lane -> store // * Narrow width by halfs excluding zero/undef lanes -Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) { +Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) { auto *ConstMask = dyn_cast(II.getArgOperand(3)); if (!ConstMask) return nullptr; @@ -1165,7 +336,7 @@ // * Narrow width by halfs excluding zero/undef lanes // * Vector splat address w/known mask -> scalar load // * Vector incrementing address -> vector masked load -Instruction *InstCombiner::simplifyMaskedGather(IntrinsicInst &II) { +Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) { return nullptr; } @@ -1175,7 +346,7 @@ // * Narrow store width by halfs excluding zero/undef lanes // * Vector splat address w/known mask -> scalar store // * Vector incrementing address -> vector masked store -Instruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) { +Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { auto *ConstMask = dyn_cast(II.getArgOperand(3)); if (!ConstMask) return nullptr; @@ -1206,7 +377,7 @@ /// This is legal because it preserves the most recent information about /// the presence or absence of invariant.group. static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II, - InstCombiner &IC) { + InstCombinerImpl &IC) { auto *Arg = II.getArgOperand(0); auto *StrippedArg = Arg->stripPointerCasts(); auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups(); @@ -1231,7 +402,7 @@ return cast(Result); } -static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { +static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { assert((II.getIntrinsicID() == Intrinsic::cttz || II.getIntrinsicID() == Intrinsic::ctlz) && "Expected cttz or ctlz intrinsic"); @@ -1301,7 +472,7 @@ return nullptr; } -static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) { +static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { assert(II.getIntrinsicID() == Intrinsic::ctpop && "Expected ctpop intrinsic"); Type *Ty = II.getType(); @@ -1356,107 +527,6 @@ return nullptr; } -// TODO: If the x86 backend knew how to convert a bool vector mask back to an -// XMM register mask efficiently, we could transform all x86 masked intrinsics -// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. -static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { - Value *Ptr = II.getOperand(0); - Value *Mask = II.getOperand(1); - Constant *ZeroVec = Constant::getNullValue(II.getType()); - - // Special case a zero mask since that's not a ConstantDataVector. 
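
(Aside, not part of this patch: the getNegativeIsTrueBoolVec conversion used by the masked load/store folds below encodes the x86 rule that a lane is active iff the sign bit of its mask element is set. A one-lane scalar C++ sketch, names illustrative.)

#include <cstdint>

// One lane of an x86 masked load: active iff the mask element is negative
// (sign bit set); inactive lanes produce zero instead of reading memory.
uint32_t maskedLoadLane(const uint32_t *Ptr, int32_t MaskElt) {
  return MaskElt < 0 ? *Ptr : 0u;
}
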
- // This masked load instruction creates a zero vector. - if (isa(Mask)) - return IC.replaceInstUsesWith(II, ZeroVec); - - auto *ConstMask = dyn_cast(Mask); - if (!ConstMask) - return nullptr; - - // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic - // to allow target-independent optimizations. - - // First, cast the x86 intrinsic scalar pointer to a vector pointer to match - // the LLVM intrinsic definition for the pointer argument. - unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); - PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); - Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); - - // Second, convert the x86 XMM integer vector mask to a vector of bools based - // on each element's most significant bit (the sign bit). - Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); - - // The pass-through vector for an x86 masked load is a zero vector. - CallInst *NewMaskedLoad = - IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec); - return IC.replaceInstUsesWith(II, NewMaskedLoad); -} - -// TODO: If the x86 backend knew how to convert a bool vector mask back to an -// XMM register mask efficiently, we could transform all x86 masked intrinsics -// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. -static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { - Value *Ptr = II.getOperand(0); - Value *Mask = II.getOperand(1); - Value *Vec = II.getOperand(2); - - // Special case a zero mask since that's not a ConstantDataVector: - // this masked store instruction does nothing. - if (isa(Mask)) { - IC.eraseInstFromFunction(II); - return true; - } - - // The SSE2 version is too weird (eg, unaligned but non-temporal) to do - // anything else at this level. - if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) - return false; - - auto *ConstMask = dyn_cast(Mask); - if (!ConstMask) - return false; - - // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic - // to allow target-independent optimizations. - - // First, cast the x86 intrinsic scalar pointer to a vector pointer to match - // the LLVM intrinsic definition for the pointer argument. - unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); - PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); - Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); - - // Second, convert the x86 XMM integer vector mask to a vector of bools based - // on each element's most significant bit (the sign bit). - Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); - - IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); - - // 'Replace uses' doesn't work for stores. Erase the original masked store. - IC.eraseInstFromFunction(II); - return true; -} - -// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs. -// -// A single NaN input is folded to minnum, so we rely on that folding for -// handling NaNs. 
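
(Aside, not part of this patch: for NaN-free inputs the fmed3 helper below returns the median of its three operands. Its max-of-three formulation is equivalent to the usual min/max expression shown here in plain C++, name illustrative.)

#include <algorithm>

// Median of three finite floats: max(min(a, b), min(max(a, b), c)).
float med3(float A, float B, float C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}
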
-static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, - const APFloat &Src2) { - APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2); - - APFloat::cmpResult Cmp0 = Max3.compare(Src0); - assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately"); - if (Cmp0 == APFloat::cmpEqual) - return maxnum(Src1, Src2); - - APFloat::cmpResult Cmp1 = Max3.compare(Src1); - assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately"); - if (Cmp1 == APFloat::cmpEqual) - return maxnum(Src0, Src2); - - return maxnum(Src0, Src1); -} - /// Convert a table lookup to shufflevector if the mask is constant. /// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in /// which case we could lower the shufflevector with rev64 instructions @@ -1495,28 +565,6 @@ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes)); } -/// Convert a vector load intrinsic into a simple llvm load instruction. -/// This is beneficial when the underlying object being addressed comes -/// from a constant, since we get constant-folding for free. -static Value *simplifyNeonVld1(const IntrinsicInst &II, - unsigned MemAlign, - InstCombiner::BuilderTy &Builder) { - auto *IntrAlign = dyn_cast(II.getArgOperand(1)); - - if (!IntrAlign) - return nullptr; - - unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ? - MemAlign : IntrAlign->getLimitedValue(); - - if (!isPowerOf2_32(Alignment)) - return nullptr; - - auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), - PointerType::get(II.getType(), 0)); - return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); -} - // Returns true iff the 2 intrinsics have the same operands, limiting the // comparison to the first NumOperands. static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E, @@ -1538,9 +586,9 @@ // call @llvm.foo.start(i1 0) ; This one won't be skipped: it will be removed // call @llvm.foo.end(i1 0) // call @llvm.foo.end(i1 0) ; &I -static bool removeTriviallyEmptyRange( - IntrinsicInst &EndI, InstCombiner &IC, - std::function IsStart) { +static bool +removeTriviallyEmptyRange(IntrinsicInst &EndI, InstCombinerImpl &IC, + std::function IsStart) { // We start from the end intrinsic and scan backwards, so that InstCombine // has already processed (and potentially removed) all the instructions // before the end intrinsic. @@ -1566,256 +614,7 @@ return false; } -// Convert NVVM intrinsics to target-generic LLVM code where possible. -static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) { - // Each NVVM intrinsic we can simplify can be replaced with one of: - // - // * an LLVM intrinsic, - // * an LLVM cast operation, - // * an LLVM binary operation, or - // * ad-hoc LLVM IR for the particular operation. - - // Some transformations are only valid when the module's - // flush-denormals-to-zero (ftz) setting is true/false, whereas other - // transformations are valid regardless of the module's ftz setting. - enum FtzRequirementTy { - FTZ_Any, // Any ftz setting is ok. - FTZ_MustBeOn, // Transformation is valid only if ftz is on. - FTZ_MustBeOff, // Transformation is valid only if ftz is off. - }; - // Classes of NVVM intrinsics that can't be replaced one-to-one with a - // target-generic intrinsic, cast op, or binary op but that we can nonetheless - // simplify. - enum SpecialCase { - SPC_Reciprocal, - }; - - // SimplifyAction is a poor-man's variant (plus an additional flag) that - // represents how to replace an NVVM intrinsic with target-generic LLVM IR. 
- struct SimplifyAction { - // Invariant: At most one of these Optionals has a value. - Optional IID; - Optional CastOp; - Optional BinaryOp; - Optional Special; - - FtzRequirementTy FtzRequirement = FTZ_Any; - - SimplifyAction() = default; - - SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) - : IID(IID), FtzRequirement(FtzReq) {} - - // Cast operations don't have anything to do with FTZ, so we skip that - // argument. - SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} - - SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) - : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} - - SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) - : Special(Special), FtzRequirement(FtzReq) {} - }; - - // Try to generate a SimplifyAction describing how to replace our - // IntrinsicInstr with target-generic LLVM IR. - const SimplifyAction Action = [II]() -> SimplifyAction { - switch (II->getIntrinsicID()) { - // NVVM intrinsics that map directly to LLVM intrinsics. - case Intrinsic::nvvm_ceil_d: - return {Intrinsic::ceil, FTZ_Any}; - case Intrinsic::nvvm_ceil_f: - return {Intrinsic::ceil, FTZ_MustBeOff}; - case Intrinsic::nvvm_ceil_ftz_f: - return {Intrinsic::ceil, FTZ_MustBeOn}; - case Intrinsic::nvvm_fabs_d: - return {Intrinsic::fabs, FTZ_Any}; - case Intrinsic::nvvm_fabs_f: - return {Intrinsic::fabs, FTZ_MustBeOff}; - case Intrinsic::nvvm_fabs_ftz_f: - return {Intrinsic::fabs, FTZ_MustBeOn}; - case Intrinsic::nvvm_floor_d: - return {Intrinsic::floor, FTZ_Any}; - case Intrinsic::nvvm_floor_f: - return {Intrinsic::floor, FTZ_MustBeOff}; - case Intrinsic::nvvm_floor_ftz_f: - return {Intrinsic::floor, FTZ_MustBeOn}; - case Intrinsic::nvvm_fma_rn_d: - return {Intrinsic::fma, FTZ_Any}; - case Intrinsic::nvvm_fma_rn_f: - return {Intrinsic::fma, FTZ_MustBeOff}; - case Intrinsic::nvvm_fma_rn_ftz_f: - return {Intrinsic::fma, FTZ_MustBeOn}; - case Intrinsic::nvvm_fmax_d: - return {Intrinsic::maxnum, FTZ_Any}; - case Intrinsic::nvvm_fmax_f: - return {Intrinsic::maxnum, FTZ_MustBeOff}; - case Intrinsic::nvvm_fmax_ftz_f: - return {Intrinsic::maxnum, FTZ_MustBeOn}; - case Intrinsic::nvvm_fmin_d: - return {Intrinsic::minnum, FTZ_Any}; - case Intrinsic::nvvm_fmin_f: - return {Intrinsic::minnum, FTZ_MustBeOff}; - case Intrinsic::nvvm_fmin_ftz_f: - return {Intrinsic::minnum, FTZ_MustBeOn}; - case Intrinsic::nvvm_round_d: - return {Intrinsic::round, FTZ_Any}; - case Intrinsic::nvvm_round_f: - return {Intrinsic::round, FTZ_MustBeOff}; - case Intrinsic::nvvm_round_ftz_f: - return {Intrinsic::round, FTZ_MustBeOn}; - case Intrinsic::nvvm_sqrt_rn_d: - return {Intrinsic::sqrt, FTZ_Any}; - case Intrinsic::nvvm_sqrt_f: - // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the - // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts - // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are - // the versions with explicit ftz-ness. - return {Intrinsic::sqrt, FTZ_Any}; - case Intrinsic::nvvm_sqrt_rn_f: - return {Intrinsic::sqrt, FTZ_MustBeOff}; - case Intrinsic::nvvm_sqrt_rn_ftz_f: - return {Intrinsic::sqrt, FTZ_MustBeOn}; - case Intrinsic::nvvm_trunc_d: - return {Intrinsic::trunc, FTZ_Any}; - case Intrinsic::nvvm_trunc_f: - return {Intrinsic::trunc, FTZ_MustBeOff}; - case Intrinsic::nvvm_trunc_ftz_f: - return {Intrinsic::trunc, FTZ_MustBeOn}; - - // NVVM intrinsics that map to LLVM cast operations. 
- // - // Note that llvm's target-generic conversion operators correspond to the rz - // (round to zero) versions of the nvvm conversion intrinsics, even though - // most everything else here uses the rn (round to nearest even) nvvm ops. - case Intrinsic::nvvm_d2i_rz: - case Intrinsic::nvvm_f2i_rz: - case Intrinsic::nvvm_d2ll_rz: - case Intrinsic::nvvm_f2ll_rz: - return {Instruction::FPToSI}; - case Intrinsic::nvvm_d2ui_rz: - case Intrinsic::nvvm_f2ui_rz: - case Intrinsic::nvvm_d2ull_rz: - case Intrinsic::nvvm_f2ull_rz: - return {Instruction::FPToUI}; - case Intrinsic::nvvm_i2d_rz: - case Intrinsic::nvvm_i2f_rz: - case Intrinsic::nvvm_ll2d_rz: - case Intrinsic::nvvm_ll2f_rz: - return {Instruction::SIToFP}; - case Intrinsic::nvvm_ui2d_rz: - case Intrinsic::nvvm_ui2f_rz: - case Intrinsic::nvvm_ull2d_rz: - case Intrinsic::nvvm_ull2f_rz: - return {Instruction::UIToFP}; - - // NVVM intrinsics that map to LLVM binary ops. - case Intrinsic::nvvm_add_rn_d: - return {Instruction::FAdd, FTZ_Any}; - case Intrinsic::nvvm_add_rn_f: - return {Instruction::FAdd, FTZ_MustBeOff}; - case Intrinsic::nvvm_add_rn_ftz_f: - return {Instruction::FAdd, FTZ_MustBeOn}; - case Intrinsic::nvvm_mul_rn_d: - return {Instruction::FMul, FTZ_Any}; - case Intrinsic::nvvm_mul_rn_f: - return {Instruction::FMul, FTZ_MustBeOff}; - case Intrinsic::nvvm_mul_rn_ftz_f: - return {Instruction::FMul, FTZ_MustBeOn}; - case Intrinsic::nvvm_div_rn_d: - return {Instruction::FDiv, FTZ_Any}; - case Intrinsic::nvvm_div_rn_f: - return {Instruction::FDiv, FTZ_MustBeOff}; - case Intrinsic::nvvm_div_rn_ftz_f: - return {Instruction::FDiv, FTZ_MustBeOn}; - - // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but - // need special handling. - // - // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just - // as well. - case Intrinsic::nvvm_rcp_rn_d: - return {SPC_Reciprocal, FTZ_Any}; - case Intrinsic::nvvm_rcp_rn_f: - return {SPC_Reciprocal, FTZ_MustBeOff}; - case Intrinsic::nvvm_rcp_rn_ftz_f: - return {SPC_Reciprocal, FTZ_MustBeOn}; - - // We do not currently simplify intrinsics that give an approximate answer. - // These include: - // - // - nvvm_cos_approx_{f,ftz_f} - // - nvvm_ex2_approx_{d,f,ftz_f} - // - nvvm_lg2_approx_{d,f,ftz_f} - // - nvvm_sin_approx_{f,ftz_f} - // - nvvm_sqrt_approx_{f,ftz_f} - // - nvvm_rsqrt_approx_{d,f,ftz_f} - // - nvvm_div_approx_{ftz_d,ftz_f,f} - // - nvvm_rcp_approx_ftz_d - // - // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" - // means that fastmath is enabled in the intrinsic. Unfortunately only - // binary operators (currently) have a fastmath bit in SelectionDAG, so this - // information gets lost and we can't select on it. - // - // TODO: div and rcp are lowered to a binary op, so these we could in theory - // lower them to "fast fdiv". - - default: - return {}; - } - }(); - - // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we - // can bail out now. (Notice that in the case that IID is not an NVVM - // intrinsic, we don't have to look up any module metadata, as - // FtzRequirementTy will be FTZ_Any.) - if (Action.FtzRequirement != FTZ_Any) { - StringRef Attr = II->getFunction() - ->getFnAttribute("denormal-fp-math-f32") - .getValueAsString(); - DenormalMode Mode = parseDenormalFPAttribute(Attr); - bool FtzEnabled = Mode.Output != DenormalMode::IEEE; - - if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) - return nullptr; - } - - // Simplify to target-generic intrinsic. 
- if (Action.IID) { - SmallVector Args(II->arg_operands()); - // All the target-generic intrinsics currently of interest to us have one - // type argument, equal to that of the nvvm intrinsic's argument. - Type *Tys[] = {II->getArgOperand(0)->getType()}; - return CallInst::Create( - Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); - } - - // Simplify to target-generic binary op. - if (Action.BinaryOp) - return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), - II->getArgOperand(1), II->getName()); - - // Simplify to target-generic cast op. - if (Action.CastOp) - return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), - II->getName()); - - // All that's left are the special cases. - if (!Action.Special) - return nullptr; - - switch (*Action.Special) { - case SPC_Reciprocal: - // Simplify reciprocal. - return BinaryOperator::Create( - Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), - II->getArgOperand(0), II->getName()); - } - llvm_unreachable("All SpecialCase enumerators should be handled in switch."); -} - -Instruction *InstCombiner::visitVAEndInst(VAEndInst &I) { +Instruction *InstCombinerImpl::visitVAEndInst(VAEndInst &I) { removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) { return I.getIntrinsicID() == Intrinsic::vastart || I.getIntrinsicID() == Intrinsic::vacopy; @@ -1834,20 +633,31 @@ return nullptr; } -Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { +/// Creates a result tuple for an overflow intrinsic \p II with a given +/// \p Result and a constant \p Overflow value. +static Instruction *createOverflowTuple(IntrinsicInst *II, Value *Result, + Constant *Overflow) { + Constant *V[] = {UndefValue::get(Result->getType()), Overflow}; + StructType *ST = cast(II->getType()); + Constant *Struct = ConstantStruct::get(ST, V); + return InsertValueInst::Create(Struct, Result, 0); +} + +Instruction * +InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { WithOverflowInst *WO = cast(II); Value *OperationResult = nullptr; Constant *OverflowResult = nullptr; if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(), WO->getRHS(), *WO, OperationResult, OverflowResult)) - return CreateOverflowTuple(WO, OperationResult, OverflowResult); + return createOverflowTuple(WO, OperationResult, OverflowResult); return nullptr; } /// CallInst simplification. This mostly only handles folding of intrinsic /// instructions. For normal calls, it allows visitCallBase to do the heavy /// lifting. -Instruction *InstCombiner::visitCallInst(CallInst &CI) { +Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Don't try to simplify calls without uses. It will not do anything useful, // but will result in the following folds being skipped. 
-  if (Action.IID) {
-    SmallVector<Value *, 4> Args(II->arg_operands());
-    // All the target-generic intrinsics currently of interest to us have one
-    // type argument, equal to that of the nvvm intrinsic's argument.
-    Type *Tys[] = {II->getArgOperand(0)->getType()};
-    return CallInst::Create(
-        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
-  }
-
-  // Simplify to target-generic binary op.
-  if (Action.BinaryOp)
-    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
-                                  II->getArgOperand(1), II->getName());
-
-  // Simplify to target-generic cast op.
-  if (Action.CastOp)
-    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
-                            II->getName());
-
-  // All that's left are the special cases.
-  if (!Action.Special)
-    return nullptr;
-
-  switch (*Action.Special) {
-  case SPC_Reciprocal:
-    // Simplify reciprocal.
-    return BinaryOperator::Create(
-        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
-        II->getArgOperand(0), II->getName());
-  }
-  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
-}
-
-Instruction *InstCombiner::visitVAEndInst(VAEndInst &I) {
+Instruction *InstCombinerImpl::visitVAEndInst(VAEndInst &I) {
   removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) {
     return I.getIntrinsicID() == Intrinsic::vastart ||
            I.getIntrinsicID() == Intrinsic::vacopy;
@@ -1834,20 +633,31 @@
   return nullptr;
 }
 
-Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
+/// Creates a result tuple for an overflow intrinsic \p II with a given
+/// \p Result and a constant \p Overflow value.
+static Instruction *createOverflowTuple(IntrinsicInst *II, Value *Result,
+                                        Constant *Overflow) {
+  Constant *V[] = {UndefValue::get(Result->getType()), Overflow};
+  StructType *ST = cast<StructType>(II->getType());
+  Constant *Struct = ConstantStruct::get(ST, V);
+  return InsertValueInst::Create(Struct, Result, 0);
+}
+
+Instruction *
+InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
   WithOverflowInst *WO = cast<WithOverflowInst>(II);
   Value *OperationResult = nullptr;
   Constant *OverflowResult = nullptr;
   if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
                             WO->getRHS(), *WO, OperationResult, OverflowResult))
-    return CreateOverflowTuple(WO, OperationResult, OverflowResult);
+    return createOverflowTuple(WO, OperationResult, OverflowResult);
   return nullptr;
 }
 
 /// CallInst simplification. This mostly only handles folding of intrinsic
 /// instructions. For normal calls, it allows visitCallBase to do the heavy
 /// lifting.
-Instruction *InstCombiner::visitCallInst(CallInst &CI) {
+Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
   // Don't try to simplify calls without uses. It will not do anything useful,
   // but will result in the following folds being skipped.
if (!CI.use_empty()) @@ -1953,19 +763,8 @@ } } - if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) - return I; - - auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, - unsigned DemandedWidth) { - APInt UndefElts(Width, 0); - APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); - return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); - }; - Intrinsic::ID IID = II->getIntrinsicID(); switch (IID) { - default: break; case Intrinsic::objectsize: if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) return replaceInstUsesWith(CI, V); @@ -2465,932 +1264,6 @@ } break; } - case Intrinsic::ppc_altivec_lvx: - case Intrinsic::ppc_altivec_lvxl: - // Turn PPC lvx -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(16), DL, II, &AC, - &DT) >= 16) { - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), - PointerType::getUnqual(II->getType())); - return new LoadInst(II->getType(), Ptr, "", false, Align(16)); - } - break; - case Intrinsic::ppc_vsx_lxvw4x: - case Intrinsic::ppc_vsx_lxvd2x: { - // Turn PPC VSX loads into normal loads. - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), - PointerType::getUnqual(II->getType())); - return new LoadInst(II->getType(), Ptr, Twine(""), false, Align(1)); - } - case Intrinsic::ppc_altivec_stvx: - case Intrinsic::ppc_altivec_stvxl: - // Turn stvx -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(16), DL, II, &AC, - &DT) >= 16) { - Type *OpPtrTy = - PointerType::getUnqual(II->getArgOperand(0)->getType()); - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(II->getArgOperand(0), Ptr, false, Align(16)); - } - break; - case Intrinsic::ppc_vsx_stxvw4x: - case Intrinsic::ppc_vsx_stxvd2x: { - // Turn PPC VSX stores into normal stores. - Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(II->getArgOperand(0), Ptr, false, Align(1)); - } - case Intrinsic::ppc_qpx_qvlfs: - // Turn PPC QPX qvlfs -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(16), DL, II, &AC, - &DT) >= 16) { - Type *VTy = - VectorType::get(Builder.getFloatTy(), - cast(II->getType())->getElementCount()); - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), - PointerType::getUnqual(VTy)); - Value *Load = Builder.CreateLoad(VTy, Ptr); - return new FPExtInst(Load, II->getType()); - } - break; - case Intrinsic::ppc_qpx_qvlfd: - // Turn PPC QPX qvlfd -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(32), DL, II, &AC, - &DT) >= 32) { - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), - PointerType::getUnqual(II->getType())); - return new LoadInst(II->getType(), Ptr, "", false, Align(32)); - } - break; - case Intrinsic::ppc_qpx_qvstfs: - // Turn PPC QPX qvstfs -> store if the pointer is known aligned. 
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(16), DL, II, &AC, - &DT) >= 16) { - Type *VTy = VectorType::get( - Builder.getFloatTy(), - cast(II->getArgOperand(0)->getType())->getElementCount()); - Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy); - Type *OpPtrTy = PointerType::getUnqual(VTy); - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(TOp, Ptr, false, Align(16)); - } - break; - case Intrinsic::ppc_qpx_qvstfd: - // Turn PPC QPX qvstfd -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(32), DL, II, &AC, - &DT) >= 32) { - Type *OpPtrTy = - PointerType::getUnqual(II->getArgOperand(0)->getType()); - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(II->getArgOperand(0), Ptr, false, Align(32)); - } - break; - - case Intrinsic::x86_bmi_bextr_32: - case Intrinsic::x86_bmi_bextr_64: - case Intrinsic::x86_tbm_bextri_u32: - case Intrinsic::x86_tbm_bextri_u64: - // If the RHS is a constant we can try some simplifications. - if (auto *C = dyn_cast(II->getArgOperand(1))) { - uint64_t Shift = C->getZExtValue(); - uint64_t Length = (Shift >> 8) & 0xff; - Shift &= 0xff; - unsigned BitWidth = II->getType()->getIntegerBitWidth(); - // If the length is 0 or the shift is out of range, replace with zero. - if (Length == 0 || Shift >= BitWidth) - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); - // If the LHS is also a constant, we can completely constant fold this. - if (auto *InC = dyn_cast(II->getArgOperand(0))) { - uint64_t Result = InC->getZExtValue() >> Shift; - if (Length > BitWidth) - Length = BitWidth; - Result &= maskTrailingOnes(Length); - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); - } - // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we - // are only masking bits that a shift already cleared? - } - break; - - case Intrinsic::x86_bmi_bzhi_32: - case Intrinsic::x86_bmi_bzhi_64: - // If the RHS is a constant we can try some simplifications. - if (auto *C = dyn_cast(II->getArgOperand(1))) { - uint64_t Index = C->getZExtValue() & 0xff; - unsigned BitWidth = II->getType()->getIntegerBitWidth(); - if (Index >= BitWidth) - return replaceInstUsesWith(CI, II->getArgOperand(0)); - if (Index == 0) - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); - // If the LHS is also a constant, we can completely constant fold this. - if (auto *InC = dyn_cast(II->getArgOperand(0))) { - uint64_t Result = InC->getZExtValue(); - Result &= maskTrailingOnes(Index); - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); - } - // TODO should we convert this to an AND if the RHS is constant? - } - break; - case Intrinsic::x86_bmi_pext_32: - case Intrinsic::x86_bmi_pext_64: - if (auto *MaskC = dyn_cast(II->getArgOperand(1))) { - if (MaskC->isNullValue()) - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); - if (MaskC->isAllOnesValue()) - return replaceInstUsesWith(CI, II->getArgOperand(0)); - - if (auto *SrcC = dyn_cast(II->getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToSet = 1; - - while (Mask) { - // Isolate lowest set bit. - uint64_t BitToTest = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToSet <<= 1; - // Clear lowest set bit. 
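
(Aside, not part of this patch: the constant-folding loops in this hunk mirror the usual PEXT/PDEP reference semantics. A standalone C++ version, names illustrative, handy for sanity-checking the folds.)

#include <cstdint>

// PEXT: gather the Src bits selected by Mask into the low bits of the result.
uint64_t pext64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0, OutBit = 1;
  for (; Mask; Mask &= Mask - 1) {      // iterate over set mask bits, lowest first
    if (Src & (Mask & -Mask))           // test the source bit under this mask bit
      Result |= OutBit;
    OutBit <<= 1;
  }
  return Result;
}

// PDEP: scatter the low bits of Src to the positions selected by Mask.
uint64_t pdep64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0, InBit = 1;
  for (; Mask; Mask &= Mask - 1) {      // iterate over set mask bits, lowest first
    if (Src & InBit)                    // take the next low bit of the source
      Result |= Mask & -Mask;           // ...and place it at this mask position
    InBit <<= 1;
  }
  return Result;
}
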
- Mask &= Mask - 1; - } - - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); - } - } - break; - case Intrinsic::x86_bmi_pdep_32: - case Intrinsic::x86_bmi_pdep_64: - if (auto *MaskC = dyn_cast(II->getArgOperand(1))) { - if (MaskC->isNullValue()) - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); - if (MaskC->isAllOnesValue()) - return replaceInstUsesWith(CI, II->getArgOperand(0)); - - if (auto *SrcC = dyn_cast(II->getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToTest = 1; - - while (Mask) { - // Isolate lowest set bit. - uint64_t BitToSet = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToTest <<= 1; - // Clear lowest set bit; - Mask &= Mask - 1; - } - - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); - } - } - break; - - case Intrinsic::x86_sse_cvtss2si: - case Intrinsic::x86_sse_cvtss2si64: - case Intrinsic::x86_sse_cvttss2si: - case Intrinsic::x86_sse_cvttss2si64: - case Intrinsic::x86_sse2_cvtsd2si: - case Intrinsic::x86_sse2_cvtsd2si64: - case Intrinsic::x86_sse2_cvttsd2si: - case Intrinsic::x86_sse2_cvttsd2si64: - case Intrinsic::x86_avx512_vcvtss2si32: - case Intrinsic::x86_avx512_vcvtss2si64: - case Intrinsic::x86_avx512_vcvtss2usi32: - case Intrinsic::x86_avx512_vcvtss2usi64: - case Intrinsic::x86_avx512_vcvtsd2si32: - case Intrinsic::x86_avx512_vcvtsd2si64: - case Intrinsic::x86_avx512_vcvtsd2usi32: - case Intrinsic::x86_avx512_vcvtsd2usi64: - case Intrinsic::x86_avx512_cvttss2si: - case Intrinsic::x86_avx512_cvttss2si64: - case Intrinsic::x86_avx512_cvttss2usi: - case Intrinsic::x86_avx512_cvttss2usi64: - case Intrinsic::x86_avx512_cvttsd2si: - case Intrinsic::x86_avx512_cvttsd2si64: - case Intrinsic::x86_avx512_cvttsd2usi: - case Intrinsic::x86_avx512_cvttsd2usi64: { - // These intrinsics only demand the 0th element of their input vectors. If - // we can simplify the input based on that, do so now. 
- Value *Arg = II->getArgOperand(0); - unsigned VWidth = cast(Arg->getType())->getNumElements(); - if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) - return replaceOperand(*II, 0, V); - break; - } - - case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse_movmsk_ps: - case Intrinsic::x86_sse2_movmsk_pd: - case Intrinsic::x86_sse2_pmovmskb_128: - case Intrinsic::x86_avx_movmsk_pd_256: - case Intrinsic::x86_avx_movmsk_ps_256: - case Intrinsic::x86_avx2_pmovmskb: - if (Value *V = simplifyX86movmsk(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_sse_comieq_ss: - case Intrinsic::x86_sse_comige_ss: - case Intrinsic::x86_sse_comigt_ss: - case Intrinsic::x86_sse_comile_ss: - case Intrinsic::x86_sse_comilt_ss: - case Intrinsic::x86_sse_comineq_ss: - case Intrinsic::x86_sse_ucomieq_ss: - case Intrinsic::x86_sse_ucomige_ss: - case Intrinsic::x86_sse_ucomigt_ss: - case Intrinsic::x86_sse_ucomile_ss: - case Intrinsic::x86_sse_ucomilt_ss: - case Intrinsic::x86_sse_ucomineq_ss: - case Intrinsic::x86_sse2_comieq_sd: - case Intrinsic::x86_sse2_comige_sd: - case Intrinsic::x86_sse2_comigt_sd: - case Intrinsic::x86_sse2_comile_sd: - case Intrinsic::x86_sse2_comilt_sd: - case Intrinsic::x86_sse2_comineq_sd: - case Intrinsic::x86_sse2_ucomieq_sd: - case Intrinsic::x86_sse2_ucomige_sd: - case Intrinsic::x86_sse2_ucomigt_sd: - case Intrinsic::x86_sse2_ucomile_sd: - case Intrinsic::x86_sse2_ucomilt_sd: - case Intrinsic::x86_sse2_ucomineq_sd: - case Intrinsic::x86_avx512_vcomi_ss: - case Intrinsic::x86_avx512_vcomi_sd: - case Intrinsic::x86_avx512_mask_cmp_ss: - case Intrinsic::x86_avx512_mask_cmp_sd: { - // These intrinsics only demand the 0th element of their input vectors. If - // we can simplify the input based on that, do so now. - bool MadeChange = false; - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - unsigned VWidth = cast(Arg0->getType())->getNumElements(); - if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { - replaceOperand(*II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { - replaceOperand(*II, 1, V); - MadeChange = true; - } - if (MadeChange) - return II; - break; - } - case Intrinsic::x86_avx512_cmp_pd_128: - case Intrinsic::x86_avx512_cmp_pd_256: - case Intrinsic::x86_avx512_cmp_pd_512: - case Intrinsic::x86_avx512_cmp_ps_128: - case Intrinsic::x86_avx512_cmp_ps_256: - case Intrinsic::x86_avx512_cmp_ps_512: { - // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - bool Arg0IsZero = match(Arg0, m_PosZeroFP()); - if (Arg0IsZero) - std::swap(Arg0, Arg1); - Value *A, *B; - // This fold requires only the NINF(not +/- inf) since inf minus - // inf is nan. - // NSZ(No Signed Zeros) is not needed because zeros of any sign are - // equal for both compares. - // NNAN is not needed because nans compare the same for both compares. - // The compare intrinsic uses the above assumptions and therefore - // doesn't require additional flags. 
- if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) && - match(Arg1, m_PosZeroFP()) && isa(Arg0) && - cast(Arg0)->getFastMathFlags().noInfs())) { - if (Arg0IsZero) - std::swap(A, B); - replaceOperand(*II, 0, A); - replaceOperand(*II, 1, B); - return II; - } - break; - } - - case Intrinsic::x86_avx512_add_ps_512: - case Intrinsic::x86_avx512_div_ps_512: - case Intrinsic::x86_avx512_mul_ps_512: - case Intrinsic::x86_avx512_sub_ps_512: - case Intrinsic::x86_avx512_add_pd_512: - case Intrinsic::x86_avx512_div_pd_512: - case Intrinsic::x86_avx512_mul_pd_512: - case Intrinsic::x86_avx512_sub_pd_512: - // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular - // IR operations. - if (auto *R = dyn_cast(II->getArgOperand(2))) { - if (R->getValue() == 4) { - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - - Value *V; - switch (IID) { - default: llvm_unreachable("Case stmts out of sync!"); - case Intrinsic::x86_avx512_add_ps_512: - case Intrinsic::x86_avx512_add_pd_512: - V = Builder.CreateFAdd(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_sub_ps_512: - case Intrinsic::x86_avx512_sub_pd_512: - V = Builder.CreateFSub(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_mul_ps_512: - case Intrinsic::x86_avx512_mul_pd_512: - V = Builder.CreateFMul(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_div_ps_512: - case Intrinsic::x86_avx512_div_pd_512: - V = Builder.CreateFDiv(Arg0, Arg1); - break; - } - - return replaceInstUsesWith(*II, V); - } - } - break; - - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular - // IR operations. - if (auto *R = dyn_cast(II->getArgOperand(4))) { - if (R->getValue() == 4) { - // Extract the element as scalars. - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0); - Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0); - - Value *V; - switch (IID) { - default: llvm_unreachable("Case stmts out of sync!"); - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - V = Builder.CreateFAdd(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - V = Builder.CreateFSub(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - V = Builder.CreateFMul(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - V = Builder.CreateFDiv(LHS, RHS); - break; - } - - // Handle the masking aspect of the intrinsic. - Value *Mask = II->getArgOperand(3); - auto *C = dyn_cast(Mask); - // We don't need a select if we know the mask bit is a 1. - if (!C || !C->getValue()[0]) { - // Cast the mask to an i1 vector and then extract the lowest element. - auto *MaskTy = FixedVectorType::get( - Builder.getInt1Ty(), - cast(Mask->getType())->getBitWidth()); - Mask = Builder.CreateBitCast(Mask, MaskTy); - Mask = Builder.CreateExtractElement(Mask, (uint64_t)0); - // Extract the lowest element from the passthru operand. 
- Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2), - (uint64_t)0); - V = Builder.CreateSelect(Mask, V, Passthru); - } - - // Insert the result back into the original argument 0. - V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0); - - return replaceInstUsesWith(*II, V); - } - } - break; - - // Constant fold ashr( , Ci ). - // Constant fold lshr( , Ci ). - // Constant fold shl( , Ci ). - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx512_psrai_q_128: - case Intrinsic::x86_avx512_psrai_q_256: - case Intrinsic::x86_avx512_psrai_d_512: - case Intrinsic::x86_avx512_psrai_q_512: - case Intrinsic::x86_avx512_psrai_w_512: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx512_psrli_d_512: - case Intrinsic::x86_avx512_psrli_q_512: - case Intrinsic::x86_avx512_psrli_w_512: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx512_pslli_d_512: - case Intrinsic::x86_avx512_pslli_q_512: - case Intrinsic::x86_avx512_pslli_w_512: - if (Value *V = simplifyX86immShift(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_avx2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx512_psra_q_128: - case Intrinsic::x86_avx512_psra_q_256: - case Intrinsic::x86_avx512_psra_d_512: - case Intrinsic::x86_avx512_psra_q_512: - case Intrinsic::x86_avx512_psra_w_512: - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx512_psrl_d_512: - case Intrinsic::x86_avx512_psrl_q_512: - case Intrinsic::x86_avx512_psrl_w_512: - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx512_psll_d_512: - case Intrinsic::x86_avx512_psll_q_512: - case Intrinsic::x86_avx512_psll_w_512: { - if (Value *V = simplifyX86immShift(*II, Builder)) - return replaceInstUsesWith(*II, V); - - // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector - // operand to compute the shift amount. 
- Value *Arg1 = II->getArgOperand(1); - assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && - "Unexpected packed shift size"); - unsigned VWidth = cast(Arg1->getType())->getNumElements(); - - if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) - return replaceOperand(*II, 1, V); - break; - } - - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx512_psllv_d_512: - case Intrinsic::x86_avx512_psllv_q_512: - case Intrinsic::x86_avx512_psllv_w_128: - case Intrinsic::x86_avx512_psllv_w_256: - case Intrinsic::x86_avx512_psllv_w_512: - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - case Intrinsic::x86_avx512_psrav_q_128: - case Intrinsic::x86_avx512_psrav_q_256: - case Intrinsic::x86_avx512_psrav_d_512: - case Intrinsic::x86_avx512_psrav_q_512: - case Intrinsic::x86_avx512_psrav_w_128: - case Intrinsic::x86_avx512_psrav_w_256: - case Intrinsic::x86_avx512_psrav_w_512: - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx512_psrlv_d_512: - case Intrinsic::x86_avx512_psrlv_q_512: - case Intrinsic::x86_avx512_psrlv_w_128: - case Intrinsic::x86_avx512_psrlv_w_256: - case Intrinsic::x86_avx512_psrlv_w_512: - if (Value *V = simplifyX86varShift(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - case Intrinsic::x86_avx512_packssdw_512: - case Intrinsic::x86_avx512_packsswb_512: - if (Value *V = simplifyX86pack(*II, Builder, true)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx512_packusdw_512: - case Intrinsic::x86_avx512_packuswb_512: - if (Value *V = simplifyX86pack(*II, Builder, false)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_pclmulqdq: - case Intrinsic::x86_pclmulqdq_256: - case Intrinsic::x86_pclmulqdq_512: { - if (auto *C = dyn_cast(II->getArgOperand(2))) { - unsigned Imm = C->getZExtValue(); - - bool MadeChange = false; - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - unsigned VWidth = cast(Arg0->getType())->getNumElements(); - - APInt UndefElts1(VWidth, 0); - APInt DemandedElts1 = APInt::getSplat(VWidth, - APInt(2, (Imm & 0x01) ? 2 : 1)); - if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1, - UndefElts1)) { - replaceOperand(*II, 0, V); - MadeChange = true; - } - - APInt UndefElts2(VWidth, 0); - APInt DemandedElts2 = APInt::getSplat(VWidth, - APInt(2, (Imm & 0x10) ? 2 : 1)); - if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2, - UndefElts2)) { - replaceOperand(*II, 1, V); - MadeChange = true; - } - - // If either input elements are undef, the result is zero. 
- if (DemandedElts1.isSubsetOf(UndefElts1) || - DemandedElts2.isSubsetOf(UndefElts2)) - return replaceInstUsesWith(*II, - ConstantAggregateZero::get(II->getType())); - - if (MadeChange) - return II; - } - break; - } - - case Intrinsic::x86_sse41_insertps: - if (Value *V = simplifyX86insertps(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_sse4a_extrq: { - Value *Op0 = II->getArgOperand(0); - Value *Op1 = II->getArgOperand(1); - unsigned VWidth0 = cast(Op0->getType())->getNumElements(); - unsigned VWidth1 = cast(Op1->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && - VWidth1 == 16 && "Unexpected operand sizes"); - - // See if we're dealing with constant values. - Constant *C1 = dyn_cast(Op1); - ConstantInt *CILength = - C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)0)) - : nullptr; - ConstantInt *CIIndex = - C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)1)) - : nullptr; - - // Attempt to simplify to a constant, shuffle vector or EXTRQI call. - if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) - return replaceInstUsesWith(*II, V); - - // EXTRQ only uses the lowest 64-bits of the first 128-bit vector - // operands and the lowest 16-bits of the second. - bool MadeChange = false; - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { - replaceOperand(*II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { - replaceOperand(*II, 1, V); - MadeChange = true; - } - if (MadeChange) - return II; - break; - } - - case Intrinsic::x86_sse4a_extrqi: { - // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining - // bits of the lower 64-bits. The upper 64-bits are undefined. - Value *Op0 = II->getArgOperand(0); - unsigned VWidth = cast(Op0->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && - "Unexpected operand size"); - - // See if we're dealing with constant values. - ConstantInt *CILength = dyn_cast(II->getArgOperand(1)); - ConstantInt *CIIndex = dyn_cast(II->getArgOperand(2)); - - // Attempt to simplify to a constant or shuffle vector. - if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) - return replaceInstUsesWith(*II, V); - - // EXTRQI only uses the lowest 64-bits of the first 128-bit vector - // operand. - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) - return replaceOperand(*II, 0, V); - break; - } - - case Intrinsic::x86_sse4a_insertq: { - Value *Op0 = II->getArgOperand(0); - Value *Op1 = II->getArgOperand(1); - unsigned VWidth = cast(Op0->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && - cast(Op1->getType())->getNumElements() == 2 && - "Unexpected operand size"); - - // See if we're dealing with constant values. - Constant *C1 = dyn_cast(Op1); - ConstantInt *CI11 = - C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)1)) - : nullptr; - - // Attempt to simplify to a constant, shuffle vector or INSERTQI call. - if (CI11) { - const APInt &V11 = CI11->getValue(); - APInt Len = V11.zextOrTrunc(6); - APInt Idx = V11.lshr(8).zextOrTrunc(6); - if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) - return replaceInstUsesWith(*II, V); - } - - // INSERTQ only uses the lowest 64-bits of the first 128-bit vector - // operand. 
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) - return replaceOperand(*II, 0, V); - break; - } - - case Intrinsic::x86_sse4a_insertqi: { - // INSERTQI: Extract lowest Length bits from lower half of second source and - // insert over first source starting at Index bit. The upper 64-bits are - // undefined. - Value *Op0 = II->getArgOperand(0); - Value *Op1 = II->getArgOperand(1); - unsigned VWidth0 = cast(Op0->getType())->getNumElements(); - unsigned VWidth1 = cast(Op1->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && - VWidth1 == 2 && "Unexpected operand sizes"); - - // See if we're dealing with constant values. - ConstantInt *CILength = dyn_cast(II->getArgOperand(2)); - ConstantInt *CIIndex = dyn_cast(II->getArgOperand(3)); - - // Attempt to simplify to a constant or shuffle vector. - if (CILength && CIIndex) { - APInt Len = CILength->getValue().zextOrTrunc(6); - APInt Idx = CIIndex->getValue().zextOrTrunc(6); - if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) - return replaceInstUsesWith(*II, V); - } - - // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector - // operands. - bool MadeChange = false; - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { - replaceOperand(*II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { - replaceOperand(*II, 1, V); - MadeChange = true; - } - if (MadeChange) - return II; - break; - } - - case Intrinsic::x86_sse41_pblendvb: - case Intrinsic::x86_sse41_blendvps: - case Intrinsic::x86_sse41_blendvpd: - case Intrinsic::x86_avx_blendv_ps_256: - case Intrinsic::x86_avx_blendv_pd_256: - case Intrinsic::x86_avx2_pblendvb: { - // fold (blend A, A, Mask) -> A - Value *Op0 = II->getArgOperand(0); - Value *Op1 = II->getArgOperand(1); - Value *Mask = II->getArgOperand(2); - if (Op0 == Op1) - return replaceInstUsesWith(CI, Op0); - - // Zero Mask - select 1st argument. - if (isa(Mask)) - return replaceInstUsesWith(CI, Op0); - - // Constant Mask - select 1st/2nd argument lane based on top bit of mask. - if (auto *ConstantMask = dyn_cast(Mask)) { - Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); - return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); - } - - // Convert to a vector select if we can bypass casts and find a boolean - // vector condition value. - Value *BoolVec; - Mask = peekThroughBitcast(Mask); - if (match(Mask, m_SExt(m_Value(BoolVec))) && - BoolVec->getType()->isVectorTy() && - BoolVec->getType()->getScalarSizeInBits() == 1) { - assert(Mask->getType()->getPrimitiveSizeInBits() == - II->getType()->getPrimitiveSizeInBits() && - "Not expecting mask and operands with different sizes"); - - unsigned NumMaskElts = - cast(Mask->getType())->getNumElements(); - unsigned NumOperandElts = - cast(II->getType())->getNumElements(); - if (NumMaskElts == NumOperandElts) - return SelectInst::Create(BoolVec, Op1, Op0); - - // If the mask has less elements than the operands, each mask bit maps to - // multiple elements of the operands. Bitcast back and forth. 
- if (NumMaskElts < NumOperandElts) { - Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType()); - Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType()); - Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0); - return new BitCastInst(Sel, II->getType()); - } - } - - break; - } - - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - case Intrinsic::x86_avx512_pshuf_b_512: - if (Value *V = simplifyX86pshufb(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_avx_vpermilvar_ps: - case Intrinsic::x86_avx_vpermilvar_ps_256: - case Intrinsic::x86_avx512_vpermilvar_ps_512: - case Intrinsic::x86_avx_vpermilvar_pd: - case Intrinsic::x86_avx_vpermilvar_pd_256: - case Intrinsic::x86_avx512_vpermilvar_pd_512: - if (Value *V = simplifyX86vpermilvar(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: - case Intrinsic::x86_avx512_permvar_df_256: - case Intrinsic::x86_avx512_permvar_df_512: - case Intrinsic::x86_avx512_permvar_di_256: - case Intrinsic::x86_avx512_permvar_di_512: - case Intrinsic::x86_avx512_permvar_hi_128: - case Intrinsic::x86_avx512_permvar_hi_256: - case Intrinsic::x86_avx512_permvar_hi_512: - case Intrinsic::x86_avx512_permvar_qi_128: - case Intrinsic::x86_avx512_permvar_qi_256: - case Intrinsic::x86_avx512_permvar_qi_512: - case Intrinsic::x86_avx512_permvar_sf_512: - case Intrinsic::x86_avx512_permvar_si_512: - if (Value *V = simplifyX86vpermv(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_avx_maskload_ps: - case Intrinsic::x86_avx_maskload_pd: - case Intrinsic::x86_avx_maskload_ps_256: - case Intrinsic::x86_avx_maskload_pd_256: - case Intrinsic::x86_avx2_maskload_d: - case Intrinsic::x86_avx2_maskload_q: - case Intrinsic::x86_avx2_maskload_d_256: - case Intrinsic::x86_avx2_maskload_q_256: - if (Instruction *I = simplifyX86MaskedLoad(*II, *this)) - return I; - break; - - case Intrinsic::x86_sse2_maskmov_dqu: - case Intrinsic::x86_avx_maskstore_ps: - case Intrinsic::x86_avx_maskstore_pd: - case Intrinsic::x86_avx_maskstore_ps_256: - case Intrinsic::x86_avx_maskstore_pd_256: - case Intrinsic::x86_avx2_maskstore_d: - case Intrinsic::x86_avx2_maskstore_q: - case Intrinsic::x86_avx2_maskstore_d_256: - case Intrinsic::x86_avx2_maskstore_q_256: - if (simplifyX86MaskedStore(*II, *this)) - return nullptr; - break; - - case Intrinsic::x86_addcarry_32: - case Intrinsic::x86_addcarry_64: - if (Value *V = simplifyX86addcarry(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::ppc_altivec_vperm: - // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. - // Note that ppc_altivec_vperm has a big-endian bias, so when creating - // a vectorshuffle for little endian, we must undo the transformation - // performed on vec_perm in altivec.h. That is, we must complement - // the permutation mask with respect to 31 and reverse the order of - // V1 and V2. - if (Constant *Mask = dyn_cast(II->getArgOperand(2))) { - assert(cast(Mask->getType())->getNumElements() == 16 && - "Bad type for intrinsic!"); - - // Check that all of the elements are integer constants or undefs. - bool AllEltsOk = true; - for (unsigned i = 0; i != 16; ++i) { - Constant *Elt = Mask->getAggregateElement(i); - if (!Elt || !(isa(Elt) || isa(Elt))) { - AllEltsOk = false; - break; - } - } - - if (AllEltsOk) { - // Cast the input vectors to byte vectors. 
- Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0), - Mask->getType()); - Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1), - Mask->getType()); - Value *Result = UndefValue::get(Op0->getType()); - - // Only extract each element once. - Value *ExtractedElts[32]; - memset(ExtractedElts, 0, sizeof(ExtractedElts)); - - for (unsigned i = 0; i != 16; ++i) { - if (isa(Mask->getAggregateElement(i))) - continue; - unsigned Idx = - cast(Mask->getAggregateElement(i))->getZExtValue(); - Idx &= 31; // Match the hardware behavior. - if (DL.isLittleEndian()) - Idx = 31 - Idx; - - if (!ExtractedElts[Idx]) { - Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; - Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; - ExtractedElts[Idx] = - Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, - Builder.getInt32(Idx&15)); - } - - // Insert this value into the result vector. - Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx], - Builder.getInt32(i)); - } - return CastInst::Create(Instruction::BitCast, Result, CI.getType()); - } - } - break; - - case Intrinsic::arm_neon_vld1: { - Align MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); - if (Value *V = simplifyNeonVld1(*II, MemAlign.value(), Builder)) - return replaceInstUsesWith(*II, V); - break; - } - - case Intrinsic::arm_neon_vld2: - case Intrinsic::arm_neon_vld3: - case Intrinsic::arm_neon_vld4: - case Intrinsic::arm_neon_vld2lane: - case Intrinsic::arm_neon_vld3lane: - case Intrinsic::arm_neon_vld4lane: - case Intrinsic::arm_neon_vst1: - case Intrinsic::arm_neon_vst2: - case Intrinsic::arm_neon_vst3: - case Intrinsic::arm_neon_vst4: - case Intrinsic::arm_neon_vst2lane: - case Intrinsic::arm_neon_vst3lane: - case Intrinsic::arm_neon_vst4lane: { - Align MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); - unsigned AlignArg = II->getNumArgOperands() - 1; - Value *AlignArgOp = II->getArgOperand(AlignArg); - MaybeAlign Align = cast(AlignArgOp)->getMaybeAlignValue(); - if (Align && *Align < MemAlign) - return replaceOperand(*II, AlignArg, - ConstantInt::get(Type::getInt32Ty(II->getContext()), - MemAlign.value(), false)); - break; - } case Intrinsic::arm_neon_vtbl1: case Intrinsic::aarch64_neon_tbl1: @@ -3453,690 +1326,6 @@ } break; } - case Intrinsic::arm_mve_pred_i2v: { - Value *Arg = II->getArgOperand(0); - Value *ArgArg; - if (match(Arg, m_Intrinsic(m_Value(ArgArg))) && - II->getType() == ArgArg->getType()) - return replaceInstUsesWith(*II, ArgArg); - Constant *XorMask; - if (match(Arg, - m_Xor(m_Intrinsic(m_Value(ArgArg)), - m_Constant(XorMask))) && - II->getType() == ArgArg->getType()) { - if (auto *CI = dyn_cast(XorMask)) { - if (CI->getValue().trunc(16).isAllOnesValue()) { - auto TrueVector = Builder.CreateVectorSplat( - cast(II->getType())->getNumElements(), - Builder.getTrue()); - return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); - } - } - } - KnownBits ScalarKnown(32); - if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16), - ScalarKnown, 0)) - return II; - break; - } - case Intrinsic::arm_mve_pred_v2i: { - Value *Arg = II->getArgOperand(0); - Value *ArgArg; - if (match(Arg, m_Intrinsic(m_Value(ArgArg)))) - return replaceInstUsesWith(*II, ArgArg); - if (!II->getMetadata(LLVMContext::MD_range)) { - Type *IntTy32 = Type::getInt32Ty(II->getContext()); - Metadata *M[] = { - ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), - ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF)) - }; - II->setMetadata(LLVMContext::MD_range, 
MDNode::get(II->getContext(), M)); - return II; - } - break; - } - case Intrinsic::arm_mve_vadc: - case Intrinsic::arm_mve_vadc_predicated: { - unsigned CarryOp = - (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; - assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && - "Bad type for intrinsic!"); - - KnownBits CarryKnown(32); - if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29), - CarryKnown)) - return II; - break; - } - case Intrinsic::amdgcn_rcp: { - Value *Src = II->getArgOperand(0); - - // TODO: Move to ConstantFolding/InstSimplify? - if (isa(Src)) { - Type *Ty = II->getType(); - auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); - return replaceInstUsesWith(CI, QNaN); - } - - if (II->isStrictFP()) - break; - - if (const ConstantFP *C = dyn_cast(Src)) { - const APFloat &ArgVal = C->getValueAPF(); - APFloat Val(ArgVal.getSemantics(), 1); - Val.divide(ArgVal, APFloat::rmNearestTiesToEven); - - // This is more precise than the instruction may give. - // - // TODO: The instruction always flushes denormal results (except for f16), - // should this also? - return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); - } - - break; - } - case Intrinsic::amdgcn_rsq: { - Value *Src = II->getArgOperand(0); - - // TODO: Move to ConstantFolding/InstSimplify? - if (isa(Src)) { - Type *Ty = II->getType(); - auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); - return replaceInstUsesWith(CI, QNaN); - } - - break; - } - case Intrinsic::amdgcn_frexp_mant: - case Intrinsic::amdgcn_frexp_exp: { - Value *Src = II->getArgOperand(0); - if (const ConstantFP *C = dyn_cast(Src)) { - int Exp; - APFloat Significand = frexp(C->getValueAPF(), Exp, - APFloat::rmNearestTiesToEven); - - if (IID == Intrinsic::amdgcn_frexp_mant) { - return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), - Significand)); - } - - // Match instruction special case behavior. - if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) - Exp = 0; - - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp)); - } - - if (isa(Src)) - return replaceInstUsesWith(CI, UndefValue::get(II->getType())); - - break; - } - case Intrinsic::amdgcn_class: { - enum { - S_NAN = 1 << 0, // Signaling NaN - Q_NAN = 1 << 1, // Quiet NaN - N_INFINITY = 1 << 2, // Negative infinity - N_NORMAL = 1 << 3, // Negative normal - N_SUBNORMAL = 1 << 4, // Negative subnormal - N_ZERO = 1 << 5, // Negative zero - P_ZERO = 1 << 6, // Positive zero - P_SUBNORMAL = 1 << 7, // Positive subnormal - P_NORMAL = 1 << 8, // Positive normal - P_INFINITY = 1 << 9 // Positive infinity - }; - - const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | - N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; - - Value *Src0 = II->getArgOperand(0); - Value *Src1 = II->getArgOperand(1); - const ConstantInt *CMask = dyn_cast(Src1); - if (!CMask) { - if (isa(Src0)) - return replaceInstUsesWith(*II, UndefValue::get(II->getType())); - - if (isa(Src1)) - return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); - break; - } - - uint32_t Mask = CMask->getZExtValue(); - - // If all tests are made, it doesn't matter what the value is. - if ((Mask & FullMask) == FullMask) - return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); - - if ((Mask & FullMask) == 0) - return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); - - if (Mask == (S_NAN | Q_NAN)) { - // Equivalent of isnan. 
Replace with standard fcmp. - Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0); - FCmp->takeName(II); - return replaceInstUsesWith(*II, FCmp); - } - - if (Mask == (N_ZERO | P_ZERO)) { - // Equivalent of == 0. - Value *FCmp = Builder.CreateFCmpOEQ( - Src0, ConstantFP::get(Src0->getType(), 0.0)); - - FCmp->takeName(II); - return replaceInstUsesWith(*II, FCmp); - } - - // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other - if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) - return replaceOperand(*II, 1, ConstantInt::get(Src1->getType(), - Mask & ~(S_NAN | Q_NAN))); - - const ConstantFP *CVal = dyn_cast(Src0); - if (!CVal) { - if (isa(Src0)) - return replaceInstUsesWith(*II, UndefValue::get(II->getType())); - - // Clamp mask to used bits - if ((Mask & FullMask) != Mask) { - CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(), - { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } - ); - - NewCall->takeName(II); - return replaceInstUsesWith(*II, NewCall); - } - - break; - } - - const APFloat &Val = CVal->getValueAPF(); - - bool Result = - ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || - ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || - ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || - ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || - ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || - ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || - ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || - ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || - ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || - ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); - - return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); - } - case Intrinsic::amdgcn_cvt_pkrtz: { - Value *Src0 = II->getArgOperand(0); - Value *Src1 = II->getArgOperand(1); - if (const ConstantFP *C0 = dyn_cast(Src0)) { - if (const ConstantFP *C1 = dyn_cast(Src1)) { - const fltSemantics &HalfSem - = II->getType()->getScalarType()->getFltSemantics(); - bool LosesInfo; - APFloat Val0 = C0->getValueAPF(); - APFloat Val1 = C1->getValueAPF(); - Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); - Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); - - Constant *Folded = ConstantVector::get({ - ConstantFP::get(II->getContext(), Val0), - ConstantFP::get(II->getContext(), Val1) }); - return replaceInstUsesWith(*II, Folded); - } - } - - if (isa(Src0) && isa(Src1)) - return replaceInstUsesWith(*II, UndefValue::get(II->getType())); - - break; - } - case Intrinsic::amdgcn_cvt_pknorm_i16: - case Intrinsic::amdgcn_cvt_pknorm_u16: - case Intrinsic::amdgcn_cvt_pk_i16: - case Intrinsic::amdgcn_cvt_pk_u16: { - Value *Src0 = II->getArgOperand(0); - Value *Src1 = II->getArgOperand(1); - - if (isa(Src0) && isa(Src1)) - return replaceInstUsesWith(*II, UndefValue::get(II->getType())); - - break; - } - case Intrinsic::amdgcn_ubfe: - case Intrinsic::amdgcn_sbfe: { - // Decompose simple cases into standard shifts. - Value *Src = II->getArgOperand(0); - if (isa(Src)) - return replaceInstUsesWith(*II, Src); - - unsigned Width; - Type *Ty = II->getType(); - unsigned IntSize = Ty->getIntegerBitWidth(); - - ConstantInt *CWidth = dyn_cast(II->getArgOperand(2)); - if (CWidth) { - Width = CWidth->getZExtValue(); - if ((Width & (IntSize - 1)) == 0) - return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty)); - - // Hardware ignores high bits, so remove those. 
-      if (Width >= IntSize)
-        return replaceOperand(*II, 2, ConstantInt::get(CWidth->getType(),
-                                                       Width & (IntSize - 1)));
-    }
-
-    unsigned Offset;
-    ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
-    if (COffset) {
-      Offset = COffset->getZExtValue();
-      if (Offset >= IntSize)
-        return replaceOperand(*II, 1, ConstantInt::get(COffset->getType(),
-                                                       Offset & (IntSize - 1)));
-    }
-
-    bool Signed = IID == Intrinsic::amdgcn_sbfe;
-
-    if (!CWidth || !COffset)
-      break;
-
-    // The case of Width == 0 is handled above, which makes this transformation
-    // safe. If Width == 0, then the ashr and lshr instructions become poison
-    // values since the shift amount would be equal to the bit size.
-    assert(Width != 0);
-
-    // TODO: This allows folding to undef when the hardware has specific
-    // behavior?
-    if (Offset + Width < IntSize) {
-      Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
-      Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
-                                 : Builder.CreateLShr(Shl, IntSize - Width);
-      RightShift->takeName(II);
-      return replaceInstUsesWith(*II, RightShift);
-    }
-
-    Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
-                               : Builder.CreateLShr(Src, Offset);
-
-    RightShift->takeName(II);
-    return replaceInstUsesWith(*II, RightShift);
-  }
-  case Intrinsic::amdgcn_exp:
-  case Intrinsic::amdgcn_exp_compr: {
-    ConstantInt *En = cast<ConstantInt>(II->getArgOperand(1));
-    unsigned EnBits = En->getZExtValue();
-    if (EnBits == 0xf)
-      break; // All inputs enabled.
-
-    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
-    bool Changed = false;
-    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
-      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
-          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
-        Value *Src = II->getArgOperand(I + 2);
-        if (!isa<UndefValue>(Src)) {
-          replaceOperand(*II, I + 2, UndefValue::get(Src->getType()));
-          Changed = true;
-        }
-      }
-    }
-
-    if (Changed)
-      return II;
-
-    break;
-  }
-  case Intrinsic::amdgcn_fmed3: {
-    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
-    // for the shader.
-
-    Value *Src0 = II->getArgOperand(0);
-    Value *Src1 = II->getArgOperand(1);
-    Value *Src2 = II->getArgOperand(2);
-
-    // Checking for NaN before canonicalization provides better fidelity when
-    // mapping other operations onto fmed3 since the order of operands is
-    // unchanged.
-    CallInst *NewCall = nullptr;
-    if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) {
-      NewCall = Builder.CreateMinNum(Src1, Src2);
-    } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) {
-      NewCall = Builder.CreateMinNum(Src0, Src2);
-    } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
-      NewCall = Builder.CreateMaxNum(Src0, Src1);
-    }
-
-    if (NewCall) {
-      NewCall->copyFastMathFlags(II);
-      NewCall->takeName(II);
-      return replaceInstUsesWith(*II, NewCall);
-    }
-
-    bool Swap = false;
-    // Canonicalize constants to RHS operands.
- // - // fmed3(c0, x, c1) -> fmed3(x, c0, c1) - if (isa(Src0) && !isa(Src1)) { - std::swap(Src0, Src1); - Swap = true; - } - - if (isa(Src1) && !isa(Src2)) { - std::swap(Src1, Src2); - Swap = true; - } - - if (isa(Src0) && !isa(Src1)) { - std::swap(Src0, Src1); - Swap = true; - } - - if (Swap) { - II->setArgOperand(0, Src0); - II->setArgOperand(1, Src1); - II->setArgOperand(2, Src2); - return II; - } - - if (const ConstantFP *C0 = dyn_cast(Src0)) { - if (const ConstantFP *C1 = dyn_cast(Src1)) { - if (const ConstantFP *C2 = dyn_cast(Src2)) { - APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), - C2->getValueAPF()); - return replaceInstUsesWith(*II, - ConstantFP::get(Builder.getContext(), Result)); - } - } - } - - break; - } - case Intrinsic::amdgcn_icmp: - case Intrinsic::amdgcn_fcmp: { - const ConstantInt *CC = cast(II->getArgOperand(2)); - // Guard against invalid arguments. - int64_t CCVal = CC->getZExtValue(); - bool IsInteger = IID == Intrinsic::amdgcn_icmp; - if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || - CCVal > CmpInst::LAST_ICMP_PREDICATE)) || - (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || - CCVal > CmpInst::LAST_FCMP_PREDICATE))) - break; - - Value *Src0 = II->getArgOperand(0); - Value *Src1 = II->getArgOperand(1); - - if (auto *CSrc0 = dyn_cast(Src0)) { - if (auto *CSrc1 = dyn_cast(Src1)) { - Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); - if (CCmp->isNullValue()) { - return replaceInstUsesWith( - *II, ConstantExpr::getSExt(CCmp, II->getType())); - } - - // The result of V_ICMP/V_FCMP assembly instructions (which this - // intrinsic exposes) is one bit per thread, masked with the EXEC - // register (which contains the bitmask of live threads). So a - // comparison that always returns true is the same as a read of the - // EXEC register. - Function *NewF = Intrinsic::getDeclaration( - II->getModule(), Intrinsic::read_register, II->getType()); - Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; - MDNode *MD = MDNode::get(II->getContext(), MDArgs); - Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; - CallInst *NewCall = Builder.CreateCall(NewF, Args); - NewCall->addAttribute(AttributeList::FunctionIndex, - Attribute::Convergent); - NewCall->takeName(II); - return replaceInstUsesWith(*II, NewCall); - } - - // Canonicalize constants to RHS. - CmpInst::Predicate SwapPred - = CmpInst::getSwappedPredicate(static_cast(CCVal)); - II->setArgOperand(0, Src1); - II->setArgOperand(1, Src0); - II->setArgOperand(2, ConstantInt::get(CC->getType(), - static_cast(SwapPred))); - return II; - } - - if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) - break; - - // Canonicalize compare eq with true value to compare != 0 - // llvm.amdgcn.icmp(zext (i1 x), 1, eq) - // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) - // llvm.amdgcn.icmp(sext (i1 x), -1, eq) - // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) - Value *ExtSrc; - if (CCVal == CmpInst::ICMP_EQ && - ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || - (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && - ExtSrc->getType()->isIntegerTy(1)) { - replaceOperand(*II, 1, ConstantInt::getNullValue(Src1->getType())); - replaceOperand(*II, 2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); - return II; - } - - CmpInst::Predicate SrcPred; - Value *SrcLHS; - Value *SrcRHS; - - // Fold compare eq/ne with 0 from a compare result as the predicate to the - // intrinsic. 
The typical use is a wave vote function in the library, which - // will be fed from a user code condition compared with 0. Fold in the - // redundant compare. - - // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) - // -> llvm.amdgcn.[if]cmp(a, b, pred) - // - // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) - // -> llvm.amdgcn.[if]cmp(a, b, inv pred) - if (match(Src1, m_Zero()) && - match(Src0, - m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { - if (CCVal == CmpInst::ICMP_EQ) - SrcPred = CmpInst::getInversePredicate(SrcPred); - - Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? - Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; - - Type *Ty = SrcLHS->getType(); - if (auto *CmpType = dyn_cast(Ty)) { - // Promote to next legal integer type. - unsigned Width = CmpType->getBitWidth(); - unsigned NewWidth = Width; - - // Don't do anything for i1 comparisons. - if (Width == 1) - break; - - if (Width <= 16) - NewWidth = 16; - else if (Width <= 32) - NewWidth = 32; - else if (Width <= 64) - NewWidth = 64; - else if (Width > 64) - break; // Can't handle this. - - if (Width != NewWidth) { - IntegerType *CmpTy = Builder.getIntNTy(NewWidth); - if (CmpInst::isSigned(SrcPred)) { - SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy); - SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy); - } else { - SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy); - SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy); - } - } - } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) - break; - - Function *NewF = - Intrinsic::getDeclaration(II->getModule(), NewIID, - { II->getType(), - SrcLHS->getType() }); - Value *Args[] = { SrcLHS, SrcRHS, - ConstantInt::get(CC->getType(), SrcPred) }; - CallInst *NewCall = Builder.CreateCall(NewF, Args); - NewCall->takeName(II); - return replaceInstUsesWith(*II, NewCall); - } - - break; - } - case Intrinsic::amdgcn_ballot: { - if (auto *Src = dyn_cast(II->getArgOperand(0))) { - if (Src->isZero()) { - // amdgcn.ballot(i1 0) is zero. - return replaceInstUsesWith(*II, Constant::getNullValue(II->getType())); - } - - if (Src->isOne()) { - // amdgcn.ballot(i1 1) is exec. - const char *RegName = "exec"; - if (II->getType()->isIntegerTy(32)) - RegName = "exec_lo"; - else if (!II->getType()->isIntegerTy(64)) - break; - - Function *NewF = Intrinsic::getDeclaration( - II->getModule(), Intrinsic::read_register, II->getType()); - Metadata *MDArgs[] = {MDString::get(II->getContext(), RegName)}; - MDNode *MD = MDNode::get(II->getContext(), MDArgs); - Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; - CallInst *NewCall = Builder.CreateCall(NewF, Args); - NewCall->addAttribute(AttributeList::FunctionIndex, - Attribute::Convergent); - NewCall->takeName(II); - return replaceInstUsesWith(*II, NewCall); - } - } - break; - } - case Intrinsic::amdgcn_wqm_vote: { - // wqm_vote is identity when the argument is constant. 
-    if (!isa<Constant>(II->getArgOperand(0)))
-      break;
-
-    return replaceInstUsesWith(*II, II->getArgOperand(0));
-  }
-  case Intrinsic::amdgcn_kill: {
-    const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0));
-    if (!C || !C->getZExtValue())
-      break;
-
-    // amdgcn.kill(i1 1) is a no-op
-    return eraseInstFromFunction(CI);
-  }
-  case Intrinsic::amdgcn_update_dpp: {
-    Value *Old = II->getArgOperand(0);
-
-    auto BC = cast<ConstantInt>(II->getArgOperand(5));
-    auto RM = cast<ConstantInt>(II->getArgOperand(3));
-    auto BM = cast<ConstantInt>(II->getArgOperand(4));
-    if (BC->isZeroValue() ||
-        RM->getZExtValue() != 0xF ||
-        BM->getZExtValue() != 0xF ||
-        isa<UndefValue>(Old))
-      break;
-
-    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
-    return replaceOperand(*II, 0, UndefValue::get(Old->getType()));
-  }
-  case Intrinsic::amdgcn_permlane16:
-  case Intrinsic::amdgcn_permlanex16: {
-    // Discard vdst_in if it's not going to be read.
-    Value *VDstIn = II->getArgOperand(0);
-    if (isa<UndefValue>(VDstIn))
-      break;
-
-    ConstantInt *FetchInvalid = cast<ConstantInt>(II->getArgOperand(4));
-    ConstantInt *BoundCtrl = cast<ConstantInt>(II->getArgOperand(5));
-    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
-      break;
-
-    return replaceOperand(*II, 0, UndefValue::get(VDstIn->getType()));
-  }
-  case Intrinsic::amdgcn_readfirstlane:
-  case Intrinsic::amdgcn_readlane: {
-    // A constant value is trivially uniform.
-    if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0)))
-      return replaceInstUsesWith(*II, C);
-
-    // The rest of these may not be safe if the exec may not be the same between
-    // the def and use.
-    Value *Src = II->getArgOperand(0);
-    Instruction *SrcInst = dyn_cast<Instruction>(Src);
-    if (SrcInst && SrcInst->getParent() != II->getParent())
-      break;
-
-    // readfirstlane (readfirstlane x) -> readfirstlane x
-    // readlane (readfirstlane x), y -> readfirstlane x
-    if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>()))
-      return replaceInstUsesWith(*II, Src);
-
-    if (IID == Intrinsic::amdgcn_readfirstlane) {
-      // readfirstlane (readlane x, y) -> readlane x, y
-      if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>()))
-        return replaceInstUsesWith(*II, Src);
-    } else {
-      // readlane (readlane x, y), y -> readlane x, y
-      if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>(
-                         m_Value(), m_Specific(II->getArgOperand(1)))))
-        return replaceInstUsesWith(*II, Src);
-    }
-
-    break;
-  }
-  case Intrinsic::amdgcn_ldexp: {
-    // FIXME: This doesn't introduce new instructions and belongs in
-    // InstructionSimplify.
-    Type *Ty = II->getType();
-    Value *Op0 = II->getArgOperand(0);
-    Value *Op1 = II->getArgOperand(1);
-
-    // Folding undef to qnan is safe regardless of the FP mode.
-    if (isa<UndefValue>(Op0)) {
-      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
-      return replaceInstUsesWith(*II, QNaN);
-    }
-
-    const APFloat *C = nullptr;
-    match(Op0, m_APFloat(C));
-
-    // FIXME: Should flush denorms depending on FP mode, but that's ignored
-    // everywhere else.
-    //
-    // These cases should be safe, even with strictfp.
-    // ldexp(0.0, x) -> 0.0
-    // ldexp(-0.0, x) -> -0.0
-    // ldexp(inf, x) -> inf
-    // ldexp(-inf, x) -> -inf
-    if (C && (C->isZero() || C->isInfinity()))
-      return replaceInstUsesWith(*II, Op0);
-
-    // With strictfp, be more careful about possibly needing to flush denormals
-    // or not, and snan behavior depends on ieee_mode.
-    if (II->isStrictFP())
-      break;
-
-    if (C && C->isNaN()) {
-      // FIXME: We just need to make the nan quiet here, but that's unavailable
-      // on APFloat, only IEEEfloat
-      auto *Quieted = ConstantFP::get(
-        Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
-      return replaceInstUsesWith(*II, Quieted);
-    }
-
-    // ldexp(x, 0) -> x
-    // ldexp(x, undef) -> x
-    if (isa<UndefValue>(Op1) || match(Op1, m_ZeroInt()))
-      return replaceInstUsesWith(*II, Op0);
-
-    break;
-  }
  case Intrinsic::hexagon_V6_vandvrt:
  case Intrinsic::hexagon_V6_vandvrt_128B: {
    // Simplify Q -> V -> Q conversion.
@@ -4360,12 +1549,19 @@
    }
    break;
  }
+  default: {
+    // Handle target specific intrinsics
+    Optional<Instruction *> V = targetInstCombineIntrinsic(*II);
+    if (V.hasValue())
+      return V.getValue();
+    break;
+  }
  }
  return visitCallBase(*II);
}
// Fence instruction simplification
-Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
+Instruction *InstCombinerImpl::visitFenceInst(FenceInst &FI) {
  // Remove identical consecutive fences.
  Instruction *Next = FI.getNextNonDebugInstruction();
  if (auto *NFI = dyn_cast<FenceInst>(Next))
@@ -4375,12 +1571,12 @@
}
// InvokeInst simplification
-Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
+Instruction *InstCombinerImpl::visitInvokeInst(InvokeInst &II) {
  return visitCallBase(II);
}
// CallBrInst simplification
-Instruction *InstCombiner::visitCallBrInst(CallBrInst &CBI) {
+Instruction *InstCombinerImpl::visitCallBrInst(CallBrInst &CBI) {
  return visitCallBase(CBI);
}
@@ -4420,7 +1616,7 @@
  return true;
}
-Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
+Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) {
  if (!CI->getCalledFunction()) return nullptr;
  auto InstCombineRAUW = [this](Instruction *From, Value *With) {
@@ -4577,7 +1773,7 @@
}
/// Improvements for call, callbr and invoke instructions.
-Instruction *InstCombiner::visitCallBase(CallBase &Call) {
+Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
  if (isAllocationFn(&Call, &TLI))
    annotateAnyAllocSite(Call, &TLI);
@@ -4728,7 +1924,7 @@
/// If the callee is a constexpr cast of a function, attempt to move the cast to
/// the arguments of the call/callbr/invoke.
-bool InstCombiner::transformConstExprCastCall(CallBase &Call) {
+bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
  auto *Callee =
      dyn_cast<Function>(Call.getCalledOperand()->stripPointerCasts());
  if (!Callee)
@@ -5012,8 +2208,8 @@
/// Turn a call to a function created by init_trampoline / adjust_trampoline
/// intrinsic pair into a direct call to the underlying function.
Instruction * -InstCombiner::transformCallThroughTrampoline(CallBase &Call, - IntrinsicInst &Tramp) { +InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call, + IntrinsicInst &Tramp) { Value *Callee = Call.getCalledOperand(); Type *CalleeTy = Callee->getType(); FunctionType *FTy = Call.getFunctionType(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -14,10 +14,11 @@ #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include using namespace llvm; using namespace PatternMatch; @@ -81,8 +82,8 @@ /// If we find a cast of an allocation instruction, try to eliminate the cast by /// moving the type information into the alloc. -Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, - AllocaInst &AI) { +Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI, + AllocaInst &AI) { PointerType *PTy = cast(CI.getType()); IRBuilderBase::InsertPointGuard Guard(Builder); @@ -160,8 +161,8 @@ /// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns /// true for, actually insert the code to evaluate the expression. -Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, - bool isSigned) { +Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty, + bool isSigned) { if (Constant *C = dyn_cast(V)) { C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/); // If we got a constantexpr back, try to simplify it with DL info. @@ -229,8 +230,9 @@ return InsertNewInstWith(Res, *I); } -Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1, - const CastInst *CI2) { +Instruction::CastOps +InstCombinerImpl::isEliminableCastPair(const CastInst *CI1, + const CastInst *CI2) { Type *SrcTy = CI1->getSrcTy(); Type *MidTy = CI1->getDestTy(); Type *DstTy = CI2->getDestTy(); @@ -257,7 +259,7 @@ } /// Implement the transforms common to all CastInst visitors. -Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { +Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) { Value *Src = CI.getOperand(0); // Try to eliminate a cast of a cast. @@ -342,7 +344,7 @@ /// /// This function works on both vectors and scalars. /// -static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, +static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC, Instruction *CxtI) { if (canAlwaysEvaluateInType(V, Ty)) return true; @@ -459,7 +461,8 @@ /// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32 /// ---> /// extractelement <4 x i32> %X, 1 -static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) { +static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, + InstCombinerImpl &IC) { Value *TruncOp = Trunc.getOperand(0); Type *DestType = Trunc.getType(); if (!TruncOp->hasOneUse() || !isa(DestType)) @@ -498,7 +501,7 @@ /// Rotate left/right may occur in a wider type than necessary because of type /// promotion rules. Try to narrow the inputs and convert to funnel shift. 
-Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) { +Instruction *InstCombinerImpl::narrowRotate(TruncInst &Trunc) { assert((isa(Trunc.getSrcTy()) || shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) && "Don't narrow to an illegal scalar type"); @@ -582,7 +585,7 @@ /// Try to narrow the width of math or bitwise logic instructions by pulling a /// truncate ahead of binary operators. /// TODO: Transforms for truncated shifts should be moved into here. -Instruction *InstCombiner::narrowBinOp(TruncInst &Trunc) { +Instruction *InstCombinerImpl::narrowBinOp(TruncInst &Trunc) { Type *SrcTy = Trunc.getSrcTy(); Type *DestTy = Trunc.getType(); if (!isa(SrcTy) && !shouldChangeType(SrcTy, DestTy)) @@ -687,7 +690,7 @@ return nullptr; } -Instruction *InstCombiner::visitTrunc(TruncInst &Trunc) { +Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { if (Instruction *Result = commonCastTransforms(Trunc)) return Result; @@ -894,8 +897,8 @@ return nullptr; } -Instruction *InstCombiner::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext, - bool DoTransform) { +Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext, + bool DoTransform) { // If we are just checking for a icmp eq of a single bit and zext'ing it // to an integer, then shift the bit to the appropriate place and then // cast to integer to avoid the comparison. @@ -1031,7 +1034,7 @@ /// /// This function works on both vectors and scalars. static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, - InstCombiner &IC, Instruction *CxtI) { + InstCombinerImpl &IC, Instruction *CxtI) { BitsToClear = 0; if (canAlwaysEvaluateInType(V, Ty)) return true; @@ -1136,7 +1139,7 @@ } } -Instruction *InstCombiner::visitZExt(ZExtInst &CI) { +Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) { // If this zero extend is only used by a truncate, let the truncate be // eliminated before we try to optimize this zext. if (CI.hasOneUse() && isa(CI.user_back())) @@ -1274,7 +1277,8 @@ } /// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp. -Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { +Instruction *InstCombinerImpl::transformSExtICmp(ICmpInst *ICI, + Instruction &CI) { Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1); ICmpInst::Predicate Pred = ICI->getPredicate(); @@ -1410,7 +1414,7 @@ return false; } -Instruction *InstCombiner::visitSExt(SExtInst &CI) { +Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) { // If this sign extend is only used by a truncate, let the truncate be // eliminated before we try to optimize this sext. if (CI.hasOneUse() && isa(CI.user_back())) @@ -1497,7 +1501,6 @@ return nullptr; } - /// Return a Constant* for the specified floating-point constant if it fits /// in the specified FP type without changing its value. static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { @@ -1616,7 +1619,7 @@ return false; } -Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { +Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { if (Instruction *I = commonCastTransforms(FPT)) return I; @@ -1800,7 +1803,7 @@ return nullptr; } -Instruction *InstCombiner::visitFPExt(CastInst &FPExt) { +Instruction *InstCombinerImpl::visitFPExt(CastInst &FPExt) { // If the source operand is a cast from integer to FP and known exact, then // cast the integer operand directly to the destination type. 
Type *Ty = FPExt.getType(); @@ -1818,7 +1821,7 @@ /// This is safe if the intermediate type has enough bits in its mantissa to /// accurately represent all values of X. For example, this won't work with /// i64 -> float -> i64. -Instruction *InstCombiner::foldItoFPtoI(CastInst &FI) { +Instruction *InstCombinerImpl::foldItoFPtoI(CastInst &FI) { if (!isa(FI.getOperand(0)) && !isa(FI.getOperand(0))) return nullptr; @@ -1858,29 +1861,29 @@ return replaceInstUsesWith(FI, X); } -Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { +Instruction *InstCombinerImpl::visitFPToUI(FPToUIInst &FI) { if (Instruction *I = foldItoFPtoI(FI)) return I; return commonCastTransforms(FI); } -Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) { +Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) { if (Instruction *I = foldItoFPtoI(FI)) return I; return commonCastTransforms(FI); } -Instruction *InstCombiner::visitUIToFP(CastInst &CI) { +Instruction *InstCombinerImpl::visitUIToFP(CastInst &CI) { return commonCastTransforms(CI); } -Instruction *InstCombiner::visitSIToFP(CastInst &CI) { +Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) { return commonCastTransforms(CI); } -Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { +Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) { // If the source integer type is not the intptr_t type for this target, do a // trunc or zext to the intptr_t type, then inttoptr of it. This allows the // cast to be exposed to other transforms. @@ -1903,7 +1906,7 @@ } /// Implement the transforms for cast of pointer (bitcast/ptrtoint) -Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { +Instruction *InstCombinerImpl::commonPointerCastTransforms(CastInst &CI) { Value *Src = CI.getOperand(0); if (GetElementPtrInst *GEP = dyn_cast(Src)) { @@ -1925,7 +1928,7 @@ return commonCastTransforms(CI); } -Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { +Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) { // If the destination integer type is not the intptr_t type for this target, // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast // to be exposed to other transforms. @@ -1963,9 +1966,9 @@ /// Try to replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. -static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal, - VectorType *DestTy, - InstCombiner &IC) { +static Instruction * +optimizeVectorResizeWithIntegerBitCasts(Value *InVal, VectorType *DestTy, + InstCombinerImpl &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. // Convert the input type to have the same element type as the output. @@ -2165,7 +2168,7 @@ /// /// Into two insertelements that do "buildvector{%inc, %inc5}". static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI, - InstCombiner &IC) { + InstCombinerImpl &IC) { VectorType *DestVecTy = cast(CI.getType()); Value *IntInput = CI.getOperand(0); @@ -2194,7 +2197,7 @@ /// vectors better than bitcasts of scalars because vector registers are /// usually not type-specific like scalar integer or scalar floating-point. static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast, - InstCombiner &IC) { + InstCombinerImpl &IC) { // TODO: Create and use a pattern matcher for ExtractElementInst. 
auto *ExtElt = dyn_cast(BitCast.getOperand(0)); if (!ExtElt || !ExtElt->hasOneUse()) @@ -2320,7 +2323,8 @@ /// /// All the related PHI nodes can be replaced by new PHI nodes with type A. /// The uses of \p CI can be changed to the new PHI node corresponding to \p PN. -Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { +Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI, + PHINode *PN) { // BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp. if (hasStoreUsersOnly(CI)) return nullptr; @@ -2484,7 +2488,7 @@ return RetVal; } -Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { +Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { // If the operands are integer typed then apply the integer transforms, // otherwise just apply the common ones. Value *Src = CI.getOperand(0); @@ -2666,7 +2670,7 @@ return commonCastTransforms(CI); } -Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) { +Instruction *InstCombinerImpl::visitAddrSpaceCast(AddrSpaceCastInst &CI) { // If the destination pointer element type is not the same as the source's // first do a bitcast to the destination type, and then the addrspacecast. // This allows the cast to be exposed to other transforms. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" using namespace llvm; using namespace PatternMatch; @@ -142,10 +143,10 @@ /// /// If AndCst is non-null, then the loaded value is masked with that constant /// before doing the comparison. This handles cases like "A[i]&4 == 0". -Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, - GlobalVariable *GV, - CmpInst &ICI, - ConstantInt *AndCst) { +Instruction * +InstCombinerImpl::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, + GlobalVariable *GV, CmpInst &ICI, + ConstantInt *AndCst) { Constant *Init = GV->getInitializer(); if (!isa(Init) && !isa(Init)) return nullptr; @@ -422,7 +423,7 @@ /// /// If we can't emit an optimized form for this expression, this returns null. /// -static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC, +static Value *evaluateGEPOffsetExpression(User *GEP, InstCombinerImpl &IC, const DataLayout &DL) { gep_type_iterator GTI = gep_type_begin(GEP); @@ -841,9 +842,9 @@ /// Fold comparisons between a GEP instruction and something else. At this point /// we know that the GEP is on the LHS of the comparison. -Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, - ICmpInst::Predicate Cond, - Instruction &I) { +Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, + ICmpInst::Predicate Cond, + Instruction &I) { // Don't transform signed compares of GEPs into index compares. Even if the // GEP is inbounds, the final add of the base pointer can have signed overflow // and would change the result of the icmp. 
@@ -1021,9 +1022,9 @@ return transformToIndexedCompare(GEPLHS, RHS, Cond, DL); } -Instruction *InstCombiner::foldAllocaCmp(ICmpInst &ICI, - const AllocaInst *Alloca, - const Value *Other) { +Instruction *InstCombinerImpl::foldAllocaCmp(ICmpInst &ICI, + const AllocaInst *Alloca, + const Value *Other) { assert(ICI.isEquality() && "Cannot fold non-equality comparison."); // It would be tempting to fold away comparisons between allocas and any @@ -1099,8 +1100,8 @@ } /// Fold "icmp pred (X+C), X". -Instruction *InstCombiner::foldICmpAddOpConst(Value *X, const APInt &C, - ICmpInst::Predicate Pred) { +Instruction *InstCombinerImpl::foldICmpAddOpConst(Value *X, const APInt &C, + ICmpInst::Predicate Pred) { // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, // so the values can never be equal. Similarly for all other "or equals" // operators. @@ -1149,9 +1150,9 @@ /// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" -> /// (icmp eq/ne A, Log2(AP2/AP1)) -> /// (icmp eq/ne A, Log2(AP2) - Log2(AP1)). -Instruction *InstCombiner::foldICmpShrConstConst(ICmpInst &I, Value *A, - const APInt &AP1, - const APInt &AP2) { +Instruction *InstCombinerImpl::foldICmpShrConstConst(ICmpInst &I, Value *A, + const APInt &AP1, + const APInt &AP2) { assert(I.isEquality() && "Cannot fold icmp gt/lt"); auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) { @@ -1208,9 +1209,9 @@ /// Handle "(icmp eq/ne (shl AP2, A), AP1)" -> /// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)). -Instruction *InstCombiner::foldICmpShlConstConst(ICmpInst &I, Value *A, - const APInt &AP1, - const APInt &AP2) { +Instruction *InstCombinerImpl::foldICmpShlConstConst(ICmpInst &I, Value *A, + const APInt &AP1, + const APInt &AP2) { assert(I.isEquality() && "Cannot fold icmp gt/lt"); auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) { @@ -1254,7 +1255,7 @@ /// static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, ConstantInt *CI2, ConstantInt *CI1, - InstCombiner &IC) { + InstCombinerImpl &IC) { // The transformation we're trying to do here is to transform this into an // llvm.sadd.with.overflow. To do this, we have to replace the original add // with a narrower add, and discard the add-with-constant that is part of the @@ -1340,7 +1341,7 @@ /// icmp eq/ne (urem/srem %x, %y), 0 /// iff %y is a power-of-two, we can replace this with a bit test: /// icmp eq/ne (and %x, (add %y, -1)), 0 -Instruction *InstCombiner::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { +Instruction *InstCombinerImpl::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { // This fold is only valid for equality predicates. if (!I.isEquality()) return nullptr; @@ -1359,7 +1360,7 @@ /// Fold equality-comparison between zero and any (maybe truncated) right-shift /// by one-less-than-bitwidth into a sign test on the original value. 
-Instruction *InstCombiner::foldSignBitTest(ICmpInst &I) { +Instruction *InstCombinerImpl::foldSignBitTest(ICmpInst &I) { Instruction *Val; ICmpInst::Predicate Pred; if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero()))) @@ -1390,7 +1391,7 @@ } // Handle icmp pred X, 0 -Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { +Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) { CmpInst::Predicate Pred = Cmp.getPredicate(); if (!match(Cmp.getOperand(1), m_Zero())) return nullptr; @@ -1431,7 +1432,7 @@ /// should be moved to some other helper and extended as noted below (it is also /// possible that code has been made unnecessary - do we canonicalize IR to /// overflow/saturating intrinsics or not?). -Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { +Instruction *InstCombinerImpl::foldICmpWithConstant(ICmpInst &Cmp) { // Match the following pattern, which is a common idiom when writing // overflow-safe integer arithmetic functions. The source performs an addition // in wider type and explicitly checks for overflow using comparisons against @@ -1477,7 +1478,7 @@ } /// Canonicalize icmp instructions based on dominating conditions. -Instruction *InstCombiner::foldICmpWithDominatingICmp(ICmpInst &Cmp) { +Instruction *InstCombinerImpl::foldICmpWithDominatingICmp(ICmpInst &Cmp) { // This is a cheap/incomplete check for dominance - just match a single // predecessor with a conditional branch. BasicBlock *CmpBB = Cmp.getParent(); @@ -1547,9 +1548,9 @@ } /// Fold icmp (trunc X, Y), C. -Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp, - TruncInst *Trunc, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp, + TruncInst *Trunc, + const APInt &C) { ICmpInst::Predicate Pred = Cmp.getPredicate(); Value *X = Trunc->getOperand(0); if (C.isOneValue() && C.getBitWidth() > 1) { @@ -1580,9 +1581,9 @@ } /// Fold icmp (xor X, Y), C. -Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp, - BinaryOperator *Xor, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpXorConstant(ICmpInst &Cmp, + BinaryOperator *Xor, + const APInt &C) { Value *X = Xor->getOperand(0); Value *Y = Xor->getOperand(1); const APInt *XorC; @@ -1649,8 +1650,10 @@ } /// Fold icmp (and (sh X, Y), C2), C1. -Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And, - const APInt &C1, const APInt &C2) { +Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp, + BinaryOperator *And, + const APInt &C1, + const APInt &C2) { BinaryOperator *Shift = dyn_cast(And->getOperand(0)); if (!Shift || !Shift->isShift()) return nullptr; @@ -1733,9 +1736,9 @@ } /// Fold icmp (and X, C2), C1. -Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp, - BinaryOperator *And, - const APInt &C1) { +Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp, + BinaryOperator *And, + const APInt &C1) { bool isICMP_NE = Cmp.getPredicate() == ICmpInst::ICMP_NE; // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1 @@ -1841,9 +1844,9 @@ } /// Fold icmp (and X, Y), C. -Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp, - BinaryOperator *And, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp, + BinaryOperator *And, + const APInt &C) { if (Instruction *I = foldICmpAndConstConst(Cmp, And, C)) return I; @@ -1895,8 +1898,9 @@ } /// Fold icmp (or X, Y), C. 
-Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpOrConstant(ICmpInst &Cmp, + BinaryOperator *Or, + const APInt &C) { ICmpInst::Predicate Pred = Cmp.getPredicate(); if (C.isOneValue()) { // icmp slt signum(V) 1 --> icmp slt V, 1 @@ -1960,9 +1964,9 @@ } /// Fold icmp (mul X, Y), C. -Instruction *InstCombiner::foldICmpMulConstant(ICmpInst &Cmp, - BinaryOperator *Mul, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp, + BinaryOperator *Mul, + const APInt &C) { const APInt *MulC; if (!match(Mul->getOperand(1), m_APInt(MulC))) return nullptr; @@ -2043,9 +2047,9 @@ } /// Fold icmp (shl X, Y), C. -Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp, - BinaryOperator *Shl, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp, + BinaryOperator *Shl, + const APInt &C) { const APInt *ShiftVal; if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal))) return foldICmpShlConstConst(Cmp, Shl->getOperand(1), C, *ShiftVal); @@ -2183,9 +2187,9 @@ } /// Fold icmp ({al}shr X, Y), C. -Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp, - BinaryOperator *Shr, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp, + BinaryOperator *Shr, + const APInt &C) { // An exact shr only shifts out zero bits, so: // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0 Value *X = Shr->getOperand(0); @@ -2276,9 +2280,9 @@ return nullptr; } -Instruction *InstCombiner::foldICmpSRemConstant(ICmpInst &Cmp, - BinaryOperator *SRem, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp, + BinaryOperator *SRem, + const APInt &C) { // Match an 'is positive' or 'is negative' comparison of remainder by a // constant power-of-2 value: // (X % pow2C) sgt/slt 0 @@ -2315,9 +2319,9 @@ } /// Fold icmp (udiv X, Y), C. -Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp, - BinaryOperator *UDiv, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpUDivConstant(ICmpInst &Cmp, + BinaryOperator *UDiv, + const APInt &C) { const APInt *C2; if (!match(UDiv->getOperand(0), m_APInt(C2))) return nullptr; @@ -2344,9 +2348,9 @@ } /// Fold icmp ({su}div X, Y), C. -Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp, - BinaryOperator *Div, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp, + BinaryOperator *Div, + const APInt &C) { // Fold: icmp pred ([us]div X, C2), C -> range test // Fold this div into the comparison, producing a range check. // Determine, based on the divide type, what the range is being @@ -2514,9 +2518,9 @@ } /// Fold icmp (sub X, Y), C. -Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp, - BinaryOperator *Sub, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp, + BinaryOperator *Sub, + const APInt &C) { Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1); ICmpInst::Predicate Pred = Cmp.getPredicate(); const APInt *C2; @@ -2576,9 +2580,9 @@ } /// Fold icmp (add X, Y), C. 
-Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, - BinaryOperator *Add, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp, + BinaryOperator *Add, + const APInt &C) { Value *Y = Add->getOperand(1); const APInt *C2; if (Cmp.isEquality() || !match(Y, m_APInt(C2))) @@ -2642,10 +2646,10 @@ return nullptr; } -bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, - Value *&RHS, ConstantInt *&Less, - ConstantInt *&Equal, - ConstantInt *&Greater) { +bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, + Value *&RHS, ConstantInt *&Less, + ConstantInt *&Equal, + ConstantInt *&Greater) { // TODO: Generalize this to work with other comparison idioms or ensure // they get canonicalized into this form. @@ -2682,7 +2686,8 @@ if (PredB == ICmpInst::ICMP_SGT && isa(RHS2)) { // x sgt C-1 <--> x sge C <--> not(x slt C) auto FlippedStrictness = - getFlippedStrictnessPredicateAndConstant(PredB, cast(RHS2)); + InstCombiner::getFlippedStrictnessPredicateAndConstant( + PredB, cast(RHS2)); if (!FlippedStrictness) return false; assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check"); @@ -2694,9 +2699,9 @@ return PredB == ICmpInst::ICMP_SLT && RHS == RHS2; } -Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp, - SelectInst *Select, - ConstantInt *C) { +Instruction *InstCombinerImpl::foldICmpSelectConstant(ICmpInst &Cmp, + SelectInst *Select, + ConstantInt *C) { assert(C && "Cmp RHS should be a constant int!"); // If we're testing a constant value against the result of a three way @@ -2794,7 +2799,7 @@ const APInt *C; bool TrueIfSigned; if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() && - isSignBitCheck(Pred, *C, TrueIfSigned)) { + InstCombiner::isSignBitCheck(Pred, *C, TrueIfSigned)) { if (match(BCSrcOp, m_FPExt(m_Value(X))) || match(BCSrcOp, m_FPTrunc(m_Value(X)))) { // (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0 @@ -2870,7 +2875,7 @@ /// Try to fold integer comparisons with a constant operand: icmp Pred X, C /// where X is some kind of instruction. -Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { +Instruction *InstCombinerImpl::foldICmpInstWithConstant(ICmpInst &Cmp) { const APInt *C; if (!match(Cmp.getOperand(1), m_APInt(C))) return nullptr; @@ -2955,9 +2960,8 @@ /// Fold an icmp equality instruction with binary operator LHS and constant RHS: /// icmp eq/ne BO, C. -Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp, - BinaryOperator *BO, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant( + ICmpInst &Cmp, BinaryOperator *BO, const APInt &C) { // TODO: Some of these folds could work with arbitrary constants, but this // function is limited to scalar and vector splat constants. if (!Cmp.isEquality()) @@ -3072,9 +3076,8 @@ } /// Fold an equality icmp with LLVM intrinsic and constant operand. -Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp, - IntrinsicInst *II, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant( + ICmpInst &Cmp, IntrinsicInst *II, const APInt &C) { Type *Ty = II->getType(); unsigned BitWidth = C.getBitWidth(); switch (II->getIntrinsicID()) { @@ -3145,9 +3148,9 @@ } /// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C. 
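Call sites such as isSignBitCheck and getFlippedStrictnessPredicateAndConstant are now spelled with an explicit InstCombiner:: qualifier because those helpers become static members of the public InstCombiner class; their old file-local definitions are deleted from InstCombineInternal.h later in this diff. A minimal usage sketch, assuming the signature stays the same as the removed inline helper (ICmpInst::Predicate, const APInt &, bool &); the wrapper function is hypothetical.

#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Sketch: the helper is reachable through the public InstCombiner class, so
// code outside this file (for example a target hook) can reuse it.
static bool isTrueWhenSignSet(ICmpInst &Cmp) {
  const APInt *C;
  if (!match(Cmp.getOperand(1), m_APInt(C)))
    return false;
  bool TrueIfSigned;
  return InstCombiner::isSignBitCheck(Cmp.getPredicate(), *C, TrueIfSigned) &&
         TrueIfSigned;
}
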
-Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp, - IntrinsicInst *II, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpIntrinsicWithConstant(ICmpInst &Cmp, + IntrinsicInst *II, + const APInt &C) { if (Cmp.isEquality()) return foldICmpEqIntrinsicWithConstant(Cmp, II, C); @@ -3204,7 +3207,7 @@ } /// Handle icmp with constant (but not simple integer constant) RHS. -Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) { +Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Constant *RHSC = dyn_cast(Op1); Instruction *LHSI = dyn_cast(Op0); @@ -3650,7 +3653,7 @@ /// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit /// Note that the comparison is commutative, while inverted (u>=, ==) predicate /// will mean that we are looking for the opposite answer. -Value *InstCombiner::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) { +Value *InstCombinerImpl::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) { ICmpInst::Predicate Pred; Value *X, *Y; Instruction *Mul; @@ -3716,7 +3719,8 @@ /// TODO: A large part of this logic is duplicated in InstSimplify's /// simplifyICmpWithBinOp(). We should be able to share that and avoid the code /// duplication. -Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I, const SimplifyQuery &SQ) { +Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, + const SimplifyQuery &SQ) { const SimplifyQuery Q = SQ.getWithInstruction(&I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -4170,7 +4174,7 @@ return nullptr; } -Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { +Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { if (!I.isEquality()) return nullptr; @@ -4438,7 +4442,7 @@ } /// Handle icmp (cast x), (cast or constant). -Instruction *InstCombiner::foldICmpWithCastOp(ICmpInst &ICmp) { +Instruction *InstCombinerImpl::foldICmpWithCastOp(ICmpInst &ICmp) { auto *CastOp0 = dyn_cast(ICmp.getOperand(0)); if (!CastOp0) return nullptr; @@ -4493,9 +4497,10 @@ } } -OverflowResult InstCombiner::computeOverflow( - Instruction::BinaryOps BinaryOp, bool IsSigned, - Value *LHS, Value *RHS, Instruction *CxtI) const { +OverflowResult +InstCombinerImpl::computeOverflow(Instruction::BinaryOps BinaryOp, + bool IsSigned, Value *LHS, Value *RHS, + Instruction *CxtI) const { switch (BinaryOp) { default: llvm_unreachable("Unsupported binary op"); @@ -4517,9 +4522,11 @@ } } -bool InstCombiner::OptimizeOverflowCheck( - Instruction::BinaryOps BinaryOp, bool IsSigned, Value *LHS, Value *RHS, - Instruction &OrigI, Value *&Result, Constant *&Overflow) { +bool InstCombinerImpl::OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp, + bool IsSigned, Value *LHS, + Value *RHS, Instruction &OrigI, + Value *&Result, + Constant *&Overflow) { if (OrigI.isCommutative() && isa(LHS) && !isa(RHS)) std::swap(LHS, RHS); @@ -4575,7 +4582,8 @@ /// \returns Instruction which must replace the compare instruction, NULL if no /// replacement required. static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, - Value *OtherVal, InstCombiner &IC) { + Value *OtherVal, + InstCombinerImpl &IC) { // Don't bother doing this transformation for pointers, don't do it for // vectors. 
if (!isa(MulVal->getType())) @@ -4723,7 +4731,7 @@ Function *F = Intrinsic::getDeclaration( I.getModule(), Intrinsic::umul_with_overflow, MulType); CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul"); - IC.Worklist.push(MulInstr); + IC.addToWorklist(MulInstr); // If there are uses of mul result other than the comparison, we know that // they are truncation or binary AND. Change them to use result of @@ -4750,11 +4758,11 @@ } else { llvm_unreachable("Unexpected Binary operation"); } - IC.Worklist.push(cast(U)); + IC.addToWorklist(cast(U)); } } if (isa(OtherVal)) - IC.Worklist.push(cast(OtherVal)); + IC.addToWorklist(cast(OtherVal)); // The original icmp gets replaced with the overflow value, maybe inverted // depending on predicate. @@ -4799,7 +4807,7 @@ // If this is a normal comparison, it demands all bits. If it is a sign bit // comparison, it only demands the sign bit. bool UnusedBit; - if (isSignBitCheck(I.getPredicate(), *RHS, UnusedBit)) + if (InstCombiner::isSignBitCheck(I.getPredicate(), *RHS, UnusedBit)) return APInt::getSignMask(BitWidth); switch (I.getPredicate()) { @@ -4856,9 +4864,9 @@ /// \return true when \p UI is the only use of \p DI in the parent block /// and all other uses of \p DI are in blocks dominated by \p DB. /// -bool InstCombiner::dominatesAllUses(const Instruction *DI, - const Instruction *UI, - const BasicBlock *DB) const { +bool InstCombinerImpl::dominatesAllUses(const Instruction *DI, + const Instruction *UI, + const BasicBlock *DB) const { assert(DI && UI && "Instruction not defined\n"); // Ignore incomplete definitions. if (!DI->getParent()) @@ -4931,9 +4939,9 @@ /// major restriction since a NE compare should be 'normalized' to an equal /// compare, which usually happens in the combiner and test case /// select-cmp-br.ll checks for it. -bool InstCombiner::replacedSelectWithOperand(SelectInst *SI, - const ICmpInst *Icmp, - const unsigned SIOpd) { +bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI, + const ICmpInst *Icmp, + const unsigned SIOpd) { assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!"); if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) { BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1); @@ -4959,7 +4967,7 @@ /// Try to fold the comparison based on range information we can get by checking /// whether bits are known to be zero or one in the inputs. 
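processUMulZExtIdiom above now queues follow-up instructions through IC.addToWorklist(...) instead of pushing onto IC.Worklist directly, since the worklist member is no longer part of the surface that helpers are expected to poke at. A small sketch of the same pattern; the helper is hypothetical and not part of the patch.

#include "InstCombineInternal.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Sketch: after mutating users in place, hand them back to the combiner
// through the public addToWorklist() accessor.  Assumes all users are
// instructions, as in the rewrite above.
static void retargetUsers(Instruction &Old, Value *New, InstCombinerImpl &IC) {
  // Collect the users first; replaceUsesOfWith below edits Old's use list.
  SmallVector<Instruction *, 8> Users;
  for (User *U : Old.users())
    Users.push_back(cast<Instruction>(U));
  for (Instruction *UserInst : Users) {
    UserInst->replaceUsesOfWith(&Old, New);
    IC.addToWorklist(UserInst); // revisit the rewritten user
  }
}
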
-Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { +Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Type *Ty = Op0->getType(); ICmpInst::Predicate Pred = I.getPredicate(); @@ -5186,8 +5194,8 @@ } llvm::Optional> -llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, - Constant *C) { +InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, + Constant *C) { assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && "Only for relational integer predicates."); @@ -5256,7 +5264,7 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { ICmpInst::Predicate Pred = I.getPredicate(); if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) || - isCanonicalPredicate(Pred)) + InstCombiner::isCanonicalPredicate(Pred)) return nullptr; Value *Op0 = I.getOperand(0); @@ -5265,7 +5273,8 @@ if (!Op1C) return nullptr; - auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C); + auto FlippedStrictness = + InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, Op1C); if (!FlippedStrictness) return nullptr; @@ -5277,11 +5286,11 @@ static CmpInst *canonicalizeICmpPredicate(CmpInst &I) { // Is the predicate already canonical? CmpInst::Predicate Pred = I.getPredicate(); - if (isCanonicalPredicate(Pred)) + if (InstCombiner::isCanonicalPredicate(Pred)) return nullptr; // Can all users be adjusted to predicate inversion? - if (!canFreelyInvertAllUsersOf(&I, /*IgnoredUser=*/nullptr)) + if (!InstCombiner::canFreelyInvertAllUsersOf(&I, /*IgnoredUser=*/nullptr)) return nullptr; // Ok, we can canonicalize comparison! @@ -5510,7 +5519,7 @@ return ExtractValueInst::Create(UAddOv, 1); } -Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { +Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { bool Changed = false; const SimplifyQuery Q = SQ.getWithInstruction(&I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -5748,8 +5757,9 @@ } /// Fold fcmp ([us]itofp x, cst) if possible. -Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI, - Constant *RHSC) { +Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I, + Instruction *LHSI, + Constant *RHSC) { if (!isa(RHSC)) return nullptr; const APFloat &RHS = cast(RHSC)->getValueAPF(); @@ -6034,7 +6044,7 @@ } /// Optimize fabs(X) compared with zero. 
-static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombiner &IC) { +static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) { Value *X; if (!match(I.getOperand(0), m_Intrinsic(m_Value(X))) || !match(I.getOperand(1), m_PosZeroFP())) @@ -6096,7 +6106,7 @@ } } -Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { +Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { bool Changed = false; /// Orders the operands of the compare so that they are listed from most diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -15,40 +15,32 @@ #ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H #define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Argument.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/IR/Use.h" -#include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" #include -#include #define DEBUG_TYPE "instcombine" using namespace llvm::PatternMatch; +// As a default, let's assume that we want to be aggressive, +// and attempt to traverse with no limits in attempt to sink negation. +static constexpr unsigned NegatorDefaultMaxDepth = ~0U; + +// Let's guesstimate that most often we will end up visiting/producing +// fairly small number of new instructions. +static constexpr unsigned NegatorMaxNodesSSO = 16; + namespace llvm { class AAResults; @@ -65,305 +57,26 @@ class TargetLibraryInfo; class User; -/// Assign a complexity or rank value to LLVM Values. This is used to reduce -/// the amount of pattern matching needed for compares and commutative -/// instructions. For example, if we have: -/// icmp ugt X, Constant -/// or -/// xor (add X, Constant), cast Z -/// -/// We do not have to consider the commuted variants of these patterns because -/// canonicalization based on complexity guarantees the above ordering. -/// -/// This routine maps IR values to various complexity ranks: -/// 0 -> undef -/// 1 -> Constants -/// 2 -> Other non-instructions -/// 3 -> Arguments -/// 4 -> Cast and (f)neg/not instructions -/// 5 -> Other instructions -static inline unsigned getComplexity(Value *V) { - if (isa(V)) { - if (isa(V) || match(V, m_Neg(m_Value())) || - match(V, m_Not(m_Value())) || match(V, m_FNeg(m_Value()))) - return 4; - return 5; - } - if (isa(V)) - return 3; - return isa(V) ? (isa(V) ? 0 : 1) : 2; -} - -/// Predicate canonicalization reduces the number of patterns that need to be -/// matched by other transforms. 
For example, we may swap the operands of a -/// conditional branch or select to create a compare with a canonical (inverted) -/// predicate which is then more likely to be matched with other values. -static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) { - switch (Pred) { - case CmpInst::ICMP_NE: - case CmpInst::ICMP_ULE: - case CmpInst::ICMP_SLE: - case CmpInst::ICMP_UGE: - case CmpInst::ICMP_SGE: - // TODO: There are 16 FCMP predicates. Should others be (not) canonical? - case CmpInst::FCMP_ONE: - case CmpInst::FCMP_OLE: - case CmpInst::FCMP_OGE: - return false; - default: - return true; - } -} - -/// Given an exploded icmp instruction, return true if the comparison only -/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the -/// result of the comparison is true when the input value is signed. -inline bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, - bool &TrueIfSigned) { - switch (Pred) { - case ICmpInst::ICMP_SLT: // True if LHS s< 0 - TrueIfSigned = true; - return RHS.isNullValue(); - case ICmpInst::ICMP_SLE: // True if LHS s<= -1 - TrueIfSigned = true; - return RHS.isAllOnesValue(); - case ICmpInst::ICMP_SGT: // True if LHS s> -1 - TrueIfSigned = false; - return RHS.isAllOnesValue(); - case ICmpInst::ICMP_SGE: // True if LHS s>= 0 - TrueIfSigned = false; - return RHS.isNullValue(); - case ICmpInst::ICMP_UGT: - // True if LHS u> RHS and RHS == sign-bit-mask - 1 - TrueIfSigned = true; - return RHS.isMaxSignedValue(); - case ICmpInst::ICMP_UGE: - // True if LHS u>= RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) - TrueIfSigned = true; - return RHS.isMinSignedValue(); - case ICmpInst::ICMP_ULT: - // True if LHS u< RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) - TrueIfSigned = false; - return RHS.isMinSignedValue(); - case ICmpInst::ICMP_ULE: - // True if LHS u<= RHS and RHS == sign-bit-mask - 1 - TrueIfSigned = false; - return RHS.isMaxSignedValue(); - default: - return false; - } -} - -llvm::Optional> -getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, Constant *C); - -/// Return the source operand of a potentially bitcasted value while optionally -/// checking if it has one use. If there is no bitcast or the one use check is -/// not met, return the input value itself. -static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) { - if (auto *BitCast = dyn_cast(V)) - if (!OneUseOnly || BitCast->hasOneUse()) - return BitCast->getOperand(0); - - // V is not a bitcast or V has more than one use and OneUseOnly is true. - return V; -} - -/// Add one to a Constant -static inline Constant *AddOne(Constant *C) { - return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); -} - -/// Subtract one from a Constant -static inline Constant *SubOne(Constant *C) { - return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); -} - -/// Return true if the specified value is free to invert (apply ~ to). -/// This happens in cases where the ~ can be eliminated. If WillInvertAllUses -/// is true, work under the assumption that the caller intends to remove all -/// uses of V and only keep uses of ~V. -/// -/// See also: canFreelyInvertAllUsersOf() -static inline bool isFreeToInvert(Value *V, bool WillInvertAllUses) { - // ~(~(X)) -> X. - if (match(V, m_Not(m_Value()))) - return true; - - // Constants can be considered to be not'ed values. - if (match(V, m_AnyIntegralConstant())) - return true; - - // Compares can be inverted if all of their uses are being modified to use the - // ~V. 
- if (isa(V)) - return WillInvertAllUses; - - // If `V` is of the form `A + Constant` then `-1 - V` can be folded into `(-1 - // - Constant) - A` if we are willing to invert all of the uses. - if (BinaryOperator *BO = dyn_cast(V)) - if (BO->getOpcode() == Instruction::Add || - BO->getOpcode() == Instruction::Sub) - if (isa(BO->getOperand(0)) || isa(BO->getOperand(1))) - return WillInvertAllUses; - - // Selects with invertible operands are freely invertible - if (match(V, m_Select(m_Value(), m_Not(m_Value()), m_Not(m_Value())))) - return WillInvertAllUses; - - return false; -} - -/// Given i1 V, can every user of V be freely adapted if V is changed to !V ? -/// InstCombine's canonicalizeICmpPredicate() must be kept in sync with this fn. -/// -/// See also: isFreeToInvert() -static inline bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) { - // Look at every user of V. - for (Use &U : V->uses()) { - if (U.getUser() == IgnoredUser) - continue; // Don't consider this user. - - auto *I = cast(U.getUser()); - switch (I->getOpcode()) { - case Instruction::Select: - if (U.getOperandNo() != 0) // Only if the value is used as select cond. - return false; - break; - case Instruction::Br: - assert(U.getOperandNo() == 0 && "Must be branching on that value."); - break; // Free to invert by swapping true/false values/destinations. - case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring it. - if (!match(I, m_Not(m_Value()))) - return false; // Not a 'not'. - break; - default: - return false; // Don't know, likely not freely invertible. - } - // So far all users were free to invert... - } - return true; // Can freely invert all users! -} - -/// Some binary operators require special handling to avoid poison and undefined -/// behavior. If a constant vector has undef elements, replace those undefs with -/// identity constants if possible because those are always safe to execute. -/// If no identity constant exists, replace undef with some other safe constant. -static inline Constant *getSafeVectorConstantForBinop( - BinaryOperator::BinaryOps Opcode, Constant *In, bool IsRHSConstant) { - auto *InVTy = dyn_cast(In->getType()); - assert(InVTy && "Not expecting scalars here"); - - Type *EltTy = InVTy->getElementType(); - auto *SafeC = ConstantExpr::getBinOpIdentity(Opcode, EltTy, IsRHSConstant); - if (!SafeC) { - // TODO: Should this be available as a constant utility function? It is - // similar to getBinOpAbsorber(). 
- if (IsRHSConstant) { - switch (Opcode) { - case Instruction::SRem: // X % 1 = 0 - case Instruction::URem: // X %u 1 = 0 - SafeC = ConstantInt::get(EltTy, 1); - break; - case Instruction::FRem: // X % 1.0 (doesn't simplify, but it is safe) - SafeC = ConstantFP::get(EltTy, 1.0); - break; - default: - llvm_unreachable("Only rem opcodes have no identity constant for RHS"); - } - } else { - switch (Opcode) { - case Instruction::Shl: // 0 << X = 0 - case Instruction::LShr: // 0 >>u X = 0 - case Instruction::AShr: // 0 >> X = 0 - case Instruction::SDiv: // 0 / X = 0 - case Instruction::UDiv: // 0 /u X = 0 - case Instruction::SRem: // 0 % X = 0 - case Instruction::URem: // 0 %u X = 0 - case Instruction::Sub: // 0 - X (doesn't simplify, but it is safe) - case Instruction::FSub: // 0.0 - X (doesn't simplify, but it is safe) - case Instruction::FDiv: // 0.0 / X (doesn't simplify, but it is safe) - case Instruction::FRem: // 0.0 % X = 0 - SafeC = Constant::getNullValue(EltTy); - break; - default: - llvm_unreachable("Expected to find identity constant for opcode"); - } - } - } - assert(SafeC && "Must have safe constant for binop"); - unsigned NumElts = InVTy->getNumElements(); - SmallVector Out(NumElts); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *C = In->getAggregateElement(i); - Out[i] = isa(C) ? SafeC : C; - } - return ConstantVector::get(Out); -} - -/// The core instruction combiner logic. -/// -/// This class provides both the logic to recursively visit instructions and -/// combine them. -class LLVM_LIBRARY_VISIBILITY InstCombiner - : public InstVisitor { - // FIXME: These members shouldn't be public. +class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final + : public InstCombiner, + public InstVisitor { public: - /// A worklist of the instructions that need to be simplified. - InstCombineWorklist &Worklist; - - /// An IRBuilder that automatically inserts new instructions into the - /// worklist. - using BuilderTy = IRBuilder; - BuilderTy &Builder; - -private: - // Mode in which we are running the combiner. - const bool MinimizeSize; + InstCombinerImpl(InstCombineWorklist &Worklist, BuilderTy &Builder, + bool MinimizeSize, AAResults *AA, AssumptionCache &AC, + TargetLibraryInfo &TLI, TargetTransformInfo &TTI, + DominatorTree &DT, OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + const DataLayout &DL, LoopInfo *LI) + : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE, + BFI, PSI, DL, LI) {} - AAResults *AA; - - // Required analyses. - AssumptionCache &AC; - TargetLibraryInfo &TLI; - DominatorTree &DT; - const DataLayout &DL; - const SimplifyQuery SQ; - OptimizationRemarkEmitter &ORE; - BlockFrequencyInfo *BFI; - ProfileSummaryInfo *PSI; - - // Optional analyses. When non-null, these can both be used to do better - // combining and will be updated to reflect any changes. - LoopInfo *LI; - - bool MadeIRChange = false; - -public: - InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder, - bool MinimizeSize, AAResults *AA, - AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, - OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI) - : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), - AA(AA), AC(AC), TLI(TLI), DT(DT), - DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} + virtual ~InstCombinerImpl() {} /// Run the combiner over the entire worklist until it is empty. /// /// \returns true if the IR is changed. 
bool run(); - AssumptionCache &getAssumptionCache() const { return AC; } - - const DataLayout &getDataLayout() const { return DL; } - - DominatorTree &getDominatorTree() const { return DT; } - - LoopInfo *getLoopInfo() const { return LI; } - - TargetLibraryInfo &getTargetLibraryInfo() const { return TLI; } - // Visitation implementation - Implement instruction combining for different // instruction types. The semantics are as follows: // Return Value: @@ -726,7 +439,7 @@ /// When dealing with an instruction that has side effects or produces a void /// value, we can't rely on DCE to delete the instruction. Instead, visit /// methods should return the value returned by this function. - Instruction *eraseInstFromFunction(Instruction &I) { + Instruction *eraseInstFromFunction(Instruction &I) override { LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n'); assert(I.use_empty() && "Cannot erase instruction that is used!"); salvageDebugInfo(I); @@ -808,10 +521,6 @@ Instruction::BinaryOps BinaryOp, bool IsSigned, Value *LHS, Value *RHS, Instruction *CxtI) const; - /// Maximum size of array considered when transforming. - uint64_t MaxArraySizeForCombine = 0; - -private: /// Performs a few simplifications for operators which are associative /// or commutative. bool SimplifyAssociativeOrCommutative(BinaryOperator &I); @@ -857,7 +566,7 @@ unsigned Depth, Instruction *CxtI); bool SimplifyDemandedBits(Instruction *I, unsigned Op, const APInt &DemandedMask, KnownBits &Known, - unsigned Depth = 0); + unsigned Depth = 0) override; /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne /// bits. It also tries to handle simplifications that can be done based on @@ -877,13 +586,10 @@ /// demanded bits. bool SimplifyDemandedInstructionBits(Instruction &Inst); - Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, - APInt DemandedElts, - int DmaskIdx = -1); - - Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, - APInt &UndefElts, unsigned Depth = 0, - bool AllowMultipleUsers = false); + virtual Value * + SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, + unsigned Depth = 0, + bool AllowMultipleUsers = false) override; /// Canonicalize the position of binops relative to shufflevector. Instruction *foldVectorBinop(BinaryOperator &Inst); @@ -1023,18 +729,6 @@ Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap); }; -namespace { - -// As a default, let's assume that we want to be aggressive, -// and attempt to traverse with no limits in attempt to sink negation. -static constexpr unsigned NegatorDefaultMaxDepth = ~0U; - -// Let's guesstimate that most often we will end up visiting/producing -// fairly small number of new instructions. -static constexpr unsigned NegatorMaxNodesSSO = 16; - -} // namespace - class Negator final { /// Top-to-bottom, def-to-use negated instruction tree we produced. SmallVector NewInstructions; @@ -1078,7 +772,7 @@ /// Attempt to negate \p Root. Retuns nullptr if negation can't be performed, /// otherwise returns negated value. 
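The net effect of the header changes above is that InstCombinerImpl keeps the pass internals, while the public InstCombiner base it now derives from is left carrying what outside code may see: the builder, the worklist accessor, the analysis getters that disappear from this header, and virtual entry points such as eraseInstFromFunction and SimplifyDemandedVectorElts. That is the object a target receives through the instCombineIntrinsic TTI hook. The following is a minimal sketch of such a hook for a made-up target; MyTargetTTIImpl and the mytarget_passthru intrinsic are invented for illustration, and only the hook signature and the InstCombiner members used come from this patch.

#include "llvm/ADT/Optional.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

// Hypothetical target hook enabled by this patch.
Optional<Instruction *>
MyTargetTTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                      IntrinsicInst &II) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::mytarget_passthru:
    // A pass-through intrinsic folds to its only argument; the combiner
    // handles the RAUW and worklist maintenance in replaceInstUsesWith.
    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  default:
    break;
  }
  // None means "nothing target-specific to do"; generic combines still run.
  return None;
}
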
LLVM_NODISCARD static Value *Negate(bool LHSIsZero, Value *Root, - InstCombiner &IC); + InstCombinerImpl &IC); }; } // end namespace llvm diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -166,7 +167,8 @@ APInt(64, AllocaSize), DL); } -static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { +static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC, + AllocaInst &AI) { // Check for array size of 1 (scalar allocation). if (!AI.isArrayAllocation()) { // i32 1 is the canonical array size for scalar allocations. @@ -234,7 +236,7 @@ // instruction. class PointerReplacer { public: - PointerReplacer(InstCombiner &IC) : IC(IC) {} + PointerReplacer(InstCombinerImpl &IC) : IC(IC) {} void replacePointer(Instruction &I, Value *V); private: @@ -244,7 +246,7 @@ SmallVector Path; MapVector WorkMap; - InstCombiner &IC; + InstCombinerImpl &IC; }; } // end anonymous namespace @@ -323,7 +325,7 @@ findLoadAndReplace(I); } -Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { +Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) { if (auto *I = simplifyAllocaArraySize(*this, AI)) return I; @@ -421,9 +423,9 @@ /// that pointer type, load it, etc. /// /// Note that this will create all of the instructions with whatever insert -/// point the \c InstCombiner currently is using. -LoadInst *InstCombiner::combineLoadToNewType(LoadInst &LI, Type *NewTy, - const Twine &Suffix) { +/// point the \c InstCombinerImpl currently is using. +LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy, + const Twine &Suffix) { assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) && "can't fold an atomic load to requested type"); @@ -445,7 +447,8 @@ /// Combine a store to a new type. /// /// Returns the newly created store instruction. -static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) { +static StoreInst *combineStoreToNewValue(InstCombinerImpl &IC, StoreInst &SI, + Value *V) { assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) && "can't fold an atomic store of requested type"); @@ -502,7 +505,7 @@ static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) { assert(V->getType()->isPointerTy() && "Expected pointer type."); // Ignore possible ty* to ixx* bitcast. - V = peekThroughBitcast(V); + V = InstCombiner::peekThroughBitcast(V); // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax // pattern. CmpInst::Predicate Pred; @@ -537,7 +540,8 @@ /// or a volatile load. This is debatable, and might be reasonable to change /// later. However, it is risky in case some backend or other part of LLVM is /// relying on the exact type loaded to select appropriate atomic operations. -static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { +static Instruction *combineLoadToOperationType(InstCombinerImpl &IC, + LoadInst &LI) { // FIXME: We could probably with some care handle both volatile and ordered // atomic loads here but it isn't clear that this is important. 
if (!LI.isUnordered()) @@ -563,9 +567,9 @@ if (!Ty->isIntegerTy() && Ty->isSized() && !isa(Ty) && DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) && DL.typeSizeEqualsStoreSize(Ty) && !DL.isNonIntegralPointerType(Ty) && - !isMinMaxWithLoads( - peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true), - Dummy)) { + !isMinMaxWithLoads(InstCombiner::peekThroughBitcast( + LI.getPointerOperand(), /*OneUseOnly=*/true), + Dummy)) { if (all_of(LI.users(), [&LI](User *U) { auto *SI = dyn_cast(U); return SI && SI->getPointerOperand() != &LI && @@ -605,7 +609,7 @@ return nullptr; } -static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { +static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { // FIXME: We could probably with some care handle both volatile and atomic // stores here but it isn't clear that this is important. if (!LI.isSimple()) @@ -804,8 +808,9 @@ // not zero. Currently, we only handle the first such index. Also, we could // also search through non-zero constant indices if we kept track of the // offsets those indices implied. -static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI, - Instruction *MemI, unsigned &Idx) { +static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC, + GetElementPtrInst *GEPI, Instruction *MemI, + unsigned &Idx) { if (GEPI->getNumOperands() < 2) return false; @@ -874,7 +879,7 @@ // access, but the object has only one element, we can assume that the index // will always be zero. If we replace the GEP, return it. template -static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr, +static Instruction *replaceGEPIdxWithZero(InstCombinerImpl &IC, Value *Ptr, T &MemI) { if (GetElementPtrInst *GEPI = dyn_cast(Ptr)) { unsigned Idx; @@ -916,7 +921,7 @@ return false; } -Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { +Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) { Value *Op = LI.getOperand(0); // Try to canonicalize the loaded type. @@ -1033,7 +1038,7 @@ /// and the layout of a <2 x double> is isomorphic to a [2 x double], /// then %V1 can be safely approximated by a conceptual "bitcast" of %U. /// Note that %U may contain non-undef values where %V1 has undef. -static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) { +static Value *likeBitCastFromVector(InstCombinerImpl &IC, Value *V) { Value *U = nullptr; while (auto *IV = dyn_cast(V)) { auto *E = dyn_cast(IV->getInsertedValueOperand()); @@ -1094,7 +1099,7 @@ /// the caller must erase the store instruction. We have to let the caller erase /// the store instruction as otherwise there is no way to signal whether it was /// combined or not: IC.EraseInstFromFunction returns a null pointer. -static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) { +static bool combineStoreToValueType(InstCombinerImpl &IC, StoreInst &SI) { // FIXME: We could probably with some care handle both volatile and ordered // atomic stores here but it isn't clear that this is important. if (!SI.isUnordered()) @@ -1126,7 +1131,7 @@ return false; } -static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { +static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) { // FIXME: We could probably with some care handle both volatile and atomic // stores here but it isn't clear that this is important. 
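As in the compares file, the load/store helpers now spell InstCombiner::peekThroughBitcast(...) because that utility also moves onto the public class. A short usage sketch, assuming it keeps the removed helper's behaviour (return the bitcast source, optionally only for single-use bitcasts, otherwise the value itself); the wrapper function is hypothetical.

#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

// Sketch: strip a single-use bitcast from a load's pointer operand before
// inspecting the underlying object.
static bool loadsFromAlloca(LoadInst &LI) {
  Value *Ptr = InstCombiner::peekThroughBitcast(LI.getPointerOperand(),
                                                /*OneUseOnly=*/true);
  return isa<AllocaInst>(Ptr);
}
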
if (!SI.isSimple()) @@ -1266,7 +1271,7 @@ /// Converts store (bitcast (load (bitcast (select ...)))) to /// store (load (select ...)), where select is minmax: /// select ((cmp load V1, load V2), V1, V2). -static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC, +static bool removeBitcastsFromLoadStoreOnMinMax(InstCombinerImpl &IC, StoreInst &SI) { // bitcast? if (!match(SI.getPointerOperand(), m_BitCast(m_Value()))) @@ -1296,7 +1301,8 @@ if (!all_of(LI->users(), [LI, LoadAddr](User *U) { auto *SI = dyn_cast(U); return SI && SI->getPointerOperand() != LI && - peekThroughBitcast(SI->getPointerOperand()) != LoadAddr && + InstCombiner::peekThroughBitcast(SI->getPointerOperand()) != + LoadAddr && !SI->getPointerOperand()->isSwiftError(); })) return false; @@ -1314,7 +1320,7 @@ return true; } -Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { +Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { Value *Val = SI.getOperand(0); Value *Ptr = SI.getOperand(1); @@ -1433,7 +1439,7 @@ /// or: /// *P = v1; if () { *P = v2; } /// into a phi node with a store in the successor. -bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) { +bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) { if (!SI.isUnordered()) return false; // This code has not been audited for volatile/ordered case. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -32,6 +32,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include #include @@ -46,7 +47,7 @@ /// The specific integer value is used in a context where it is known to be /// non-zero. If this allows us to simplify the computation, do so and return /// the new operand, otherwise return null. -static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, +static Value *simplifyValueKnownNonZero(Value *V, InstCombinerImpl &IC, Instruction &CxtI) { // If V has multiple uses, then we would have to do more analysis to determine // if this is safe. For example, the use could be in dynamically unreached @@ -171,7 +172,7 @@ return nullptr; } -Instruction *InstCombiner::visitMul(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -423,7 +424,7 @@ return Changed ? &I : nullptr; } -Instruction *InstCombiner::foldFPSignBitOps(BinaryOperator &I) { +Instruction *InstCombinerImpl::foldFPSignBitOps(BinaryOperator &I) { BinaryOperator::BinaryOps Opcode = I.getOpcode(); assert((Opcode == Instruction::FMul || Opcode == Instruction::FDiv) && "Expected fmul or fdiv"); @@ -457,7 +458,7 @@ return nullptr; } -Instruction *InstCombiner::visitFMul(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) @@ -637,7 +638,7 @@ /// Fold a divide or remainder with a select instruction divisor when one of the /// select operands is zero. In that case, we can use the other select operand /// because div/rem by zero is undefined. 
-bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) { +bool InstCombinerImpl::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) { SelectInst *SI = dyn_cast(I.getOperand(1)); if (!SI) return false; @@ -738,7 +739,7 @@ /// instructions (udiv and sdiv). It is called by the visitors to those integer /// division instructions. /// Common integer divide transforms -Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { +Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); bool IsSigned = I.getOpcode() == Instruction::SDiv; Type *Ty = I.getType(); @@ -874,7 +875,7 @@ using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1, const BinaryOperator &I, - InstCombiner &IC); + InstCombinerImpl &IC); /// Used to maintain state for visitUDivOperand(). struct UDivFoldAction { @@ -903,7 +904,8 @@ // X udiv 2^C -> X >> C static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1, - const BinaryOperator &I, InstCombiner &IC) { + const BinaryOperator &I, + InstCombinerImpl &IC) { Constant *C1 = getLogBase2(Op0->getType(), cast(Op1)); if (!C1) llvm_unreachable("Failed to constant fold udiv -> logbase2"); @@ -916,7 +918,7 @@ // X udiv (C1 << N), where C1 is "1< X >> (N+C2) // X udiv (zext (C1 << N)), where C1 is "1< X >> (N+C2) static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I, - InstCombiner &IC) { + InstCombinerImpl &IC) { Value *ShiftLeft; if (!match(Op1, m_ZExt(m_Value(ShiftLeft)))) ShiftLeft = Op1; @@ -1010,7 +1012,7 @@ return nullptr; } -Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) { if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1104,7 +1106,7 @@ return nullptr; } -Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) { if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1265,7 +1267,7 @@ return BinaryOperator::CreateFDivFMF(NewC, X, &I); } -Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) @@ -1372,7 +1374,7 @@ /// instructions (urem and srem). It is called by the visitors to those integer /// remainder instructions. /// Common integer remainder transforms -Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) { +Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // The RHS is known non-zero. 
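foldUDivPow2Cst above implements the documented "X udiv 2^C -> X >> C" rewrite. Purely as an illustration (the in-tree fold goes through getLogBase2 so it can also handle vector constants), the scalar case looks like the sketch below; the helper name is hypothetical.

#include <cassert>

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Illustrative scalar-only version of "X udiv 2^C --> X >> C".
static Value *buildUDivByPow2(IRBuilder<> &Builder, Value *X,
                              const ConstantInt *DivisorC) {
  const APInt &D = DivisorC->getValue();
  assert(D.isPowerOf2() && "Caller must prove the divisor is 2^C");
  // udiv by 2^C is a logical shift right by C.
  return Builder.CreateLShr(X, D.logBase2());
}
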
@@ -1410,7 +1412,7 @@ return nullptr; } -Instruction *InstCombiner::visitURem(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitURem(BinaryOperator &I) { if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1461,7 +1463,7 @@ return nullptr; } -Instruction *InstCombiner::visitSRem(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitSRem(BinaryOperator &I) { if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1484,7 +1486,7 @@ // -X srem Y --> -(X srem Y) Value *X, *Y; if (match(&I, m_SRem(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y)))) - return BinaryOperator::CreateNSWNeg(Builder.CreateSRem(X, Y)); + return BinaryOperator::CreateNSWNeg(Builder.CreateSRem(X, Y)); // If the sign bits of both operands are zero (i.e. we can prove they are // unsigned inputs), turn this into a urem. @@ -1533,7 +1535,7 @@ return nullptr; } -Instruction *InstCombiner::visitFRem(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitFRem(BinaryOperator &I) { if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp @@ -42,6 +42,7 @@ #include "llvm/Support/DebugCounter.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include #include #include @@ -430,7 +431,7 @@ } LLVM_NODISCARD Value *Negator::Negate(bool LHSIsZero, Value *Root, - InstCombiner &IC) { + InstCombinerImpl &IC) { ++NegatorTotalNegationsAttempted; LLVM_DEBUG(dbgs() << "Negator: attempting to sink negation into " << *Root << "\n"); diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -30,7 +31,7 @@ /// The PHI arguments will be folded into a single operation with a PHI node /// as input. The debug location of the single operation will be the merged /// locations of the original PHI node arguments. -void InstCombiner::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) { +void InstCombinerImpl::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) { auto *FirstInst = cast(PN.getIncomingValue(0)); Inst->setDebugLoc(FirstInst->getDebugLoc()); // We do not expect a CallInst here, otherwise, N-way merging of DebugLoc @@ -93,7 +94,7 @@ // ptr_val_inc = ... // ... // -Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) { +Instruction *InstCombinerImpl::FoldIntegerTypedPHI(PHINode &PN) { if (!PN.getType()->isIntegerTy()) return nullptr; if (!PN.hasOneUse()) @@ -292,7 +293,7 @@ /// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the /// adds all have a single use, turn this into a phi and a single binop. 
-Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::FoldPHIArgBinOpIntoPHI(PHINode &PN) { Instruction *FirstInst = cast(PN.getIncomingValue(0)); assert(isa(FirstInst) || isa(FirstInst)); unsigned Opc = FirstInst->getOpcode(); @@ -385,7 +386,7 @@ return NewBinOp; } -Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::FoldPHIArgGEPIntoPHI(PHINode &PN) { GetElementPtrInst *FirstInst =cast(PN.getIncomingValue(0)); SmallVector FixedOperands(FirstInst->op_begin(), @@ -494,7 +495,6 @@ return NewGEP; } - /// Return true if we know that it is safe to sink the load out of the block /// that defines it. This means that it must be obvious the value of the load is /// not changed from the point of the load to the end of the block it is in. @@ -540,7 +540,7 @@ return true; } -Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::FoldPHIArgLoadIntoPHI(PHINode &PN) { LoadInst *FirstLI = cast(PN.getIncomingValue(0)); // FIXME: This is overconservative; this transform is allowed in some cases @@ -654,7 +654,7 @@ /// TODO: This function could handle other cast types, but then it might /// require special-casing a cast from the 'i1' type. See the comment in /// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types. -Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) { +Instruction *InstCombinerImpl::FoldPHIArgZextsIntoPHI(PHINode &Phi) { // We cannot create a new instruction after the PHI if the terminator is an // EHPad because there is no valid insertion point. if (Instruction *TI = Phi.getParent()->getTerminator()) @@ -728,7 +728,7 @@ /// If all operands to a PHI node are the same "unary" operator and they all are /// only used by the PHI, PHI together their inputs, and do the operation once, /// to the result of the PHI. -Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::FoldPHIArgOpIntoPHI(PHINode &PN) { // We cannot create a new instruction after the PHI if the terminator is an // EHPad because there is no valid insertion point. if (Instruction *TI = PN.getParent()->getTerminator()) @@ -955,7 +955,7 @@ /// TODO: The user of the trunc may be an bitcast to float/double/vector or an /// inttoptr. We should produce new PHIs in the right type. /// -Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { +Instruction *InstCombinerImpl::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // PHIUsers - Keep track of all of the truncated values extracted from a set // of PHIs, along with their offset. These are the things we want to rewrite. SmallVector PHIUsers; @@ -1203,7 +1203,7 @@ // PHINode simplification // -Instruction *InstCombiner::visitPHINode(PHINode &PN) { +Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) { if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN))) return replaceInstUsesWith(PN, V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -38,6 +38,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include #include @@ -57,7 +58,7 @@ /// constant of a binop. 
static Instruction *foldSelectBinOpIdentity(SelectInst &Sel, const TargetLibraryInfo &TLI, - InstCombiner &IC) { + InstCombinerImpl &IC) { // The select condition must be an equality compare with a constant operand. Value *X; Constant *C; @@ -279,8 +280,8 @@ } /// We have (select c, TI, FI), and we know that TI and FI have the same opcode. -Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI, - Instruction *FI) { +Instruction *InstCombinerImpl::foldSelectOpOp(SelectInst &SI, Instruction *TI, + Instruction *FI) { // Don't break up min/max patterns. The hasOneUse checks below prevent that // for most cases, but vector min/max with bitcasts can be transformed. If the // one-use restrictions are eased for other patterns, we still don't want to @@ -418,8 +419,8 @@ /// Try to fold the select into one of the operands to allow further /// optimization. -Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, - Value *FalseVal) { +Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, + Value *FalseVal) { // See the comment above GetSelectFoldableOperands for a description of the // transformation we are doing here. if (auto *TVI = dyn_cast(TrueVal)) { @@ -1024,9 +1025,9 @@ /// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2 /// Note: if C1 != C2, this will change the icmp constant to the existing /// constant operand of the select. -static Instruction * -canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp, - InstCombiner &IC) { +static Instruction *canonicalizeMinMaxWithConstant(SelectInst &Sel, + ICmpInst &Cmp, + InstCombinerImpl &IC) { if (!Cmp.hasOneUse() || !isa(Cmp.getOperand(1))) return nullptr; @@ -1070,7 +1071,7 @@ /// Canonicalize all these variants to 1 pattern. /// This makes CSE more likely. static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp, - InstCombiner &IC) { + InstCombinerImpl &IC) { if (!Cmp.hasOneUse() || !isa(Cmp.getOperand(1))) return nullptr; @@ -1253,7 +1254,7 @@ APInt::getAllOnesValue( C0->getType()->getScalarSizeInBits())))) return nullptr; // Can't do, have all-ones element[s]. - C0 = AddOne(C0); + C0 = InstCombiner::AddOne(C0); std::swap(X, Sel1); break; case ICmpInst::Predicate::ICMP_UGE: @@ -1313,7 +1314,7 @@ APInt::getSignedMaxValue( C2->getType()->getScalarSizeInBits())))) return nullptr; // Can't do, have signed max element[s]. - C2 = AddOne(C2); + C2 = InstCombiner::AddOne(C2); LLVM_FALLTHROUGH; case ICmpInst::Predicate::ICMP_SGE: // Also non-canonical, but here we don't need to change C2, @@ -1360,7 +1361,7 @@ // and swap the hands of select. static Instruction * tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, - InstCombiner &IC) { + InstCombinerImpl &IC) { ICmpInst::Predicate Pred; Value *X; Constant *C0; @@ -1375,7 +1376,7 @@ // If comparison predicate is non-canonical, then we certainly won't be able // to make it canonical; canonicalizeCmpWithConstant() already tried. - if (!isCanonicalPredicate(Pred)) + if (!InstCombiner::isCanonicalPredicate(Pred)) return nullptr; // If the [input] type of comparison and select type are different, lets abort @@ -1403,7 +1404,8 @@ return nullptr; // Check the constant we'd have with flipped-strictness predicate. 
- auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0); + auto FlippedStrictness = + InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, C0); if (!FlippedStrictness) return nullptr; @@ -1426,8 +1428,8 @@ } /// Visit a SelectInst that has an ICmpInst as its first operand. -Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, - ICmpInst *ICI) { +Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, + ICmpInst *ICI) { if (Value *V = foldSelectValueEquivalence(SI, *ICI, SQ)) return replaceInstUsesWith(SI, V); @@ -1579,11 +1581,11 @@ /// We have an SPF (e.g. a min or max) of an SPF of the form: /// SPF2(SPF1(A, B), C) -Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner, - SelectPatternFlavor SPF1, - Value *A, Value *B, - Instruction &Outer, - SelectPatternFlavor SPF2, Value *C) { +Instruction *InstCombinerImpl::foldSPFofSPF(Instruction *Inner, + SelectPatternFlavor SPF1, Value *A, + Value *B, Instruction &Outer, + SelectPatternFlavor SPF2, + Value *C) { if (Outer.getType() != Inner->getType()) return nullptr; @@ -1900,7 +1902,7 @@ return CallInst::Create(F, {X, Y}); } -Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) { +Instruction *InstCombinerImpl::foldSelectExtConst(SelectInst &Sel) { Constant *C; if (!match(Sel.getTrueValue(), m_Constant(C)) && !match(Sel.getFalseValue(), m_Constant(C))) @@ -2001,8 +2003,8 @@ /// to a vector select by splatting the condition. A splat may get folded with /// other operations in IR and having all operands of a select be vector types /// is likely better for vector codegen. -static Instruction *canonicalizeScalarSelectOfVecs( - SelectInst &Sel, InstCombiner &IC) { +static Instruction *canonicalizeScalarSelectOfVecs(SelectInst &Sel, + InstCombinerImpl &IC) { auto *Ty = dyn_cast(Sel.getType()); if (!Ty) return nullptr; @@ -2172,7 +2174,7 @@ } /// Match a sadd_sat or ssub_sat which is using min/max to clamp the value. 
-Instruction *InstCombiner::matchSAddSubSat(SelectInst &MinMax1) { +Instruction *InstCombinerImpl::matchSAddSubSat(SelectInst &MinMax1) { Type *Ty = MinMax1.getType(); // We are looking for a tree of: @@ -2372,7 +2374,8 @@ bool IsTrueIfSignSet; ICmpInst::Predicate Pred; if (!match(Cond, m_OneUse(m_ICmp(Pred, m_BitCast(m_Value(X)), m_APInt(C)))) || - !isSignBitCheck(Pred, *C, IsTrueIfSignSet) || X->getType() != SelType) + !InstCombiner::isSignBitCheck(Pred, *C, IsTrueIfSignSet) || + X->getType() != SelType) return nullptr; // If needed, negate the value that will be the sign argument of the copysign: @@ -2393,7 +2396,7 @@ return CopySign; } -Instruction *InstCombiner::foldVectorSelect(SelectInst &Sel) { +Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) { auto *VecTy = dyn_cast(Sel.getType()); if (!VecTy) return nullptr; @@ -2523,7 +2526,7 @@ return nullptr; } -Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { +Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { Value *CondVal = SI.getCondition(); Value *TrueVal = SI.getTrueValue(); Value *FalseVal = SI.getFalseValue(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" using namespace llvm; using namespace PatternMatch; @@ -31,7 +32,7 @@ // // AnalyzeForSignBitExtraction indicates that we will only analyze whether this // pattern has any 2 right-shifts that sum to 1 less than original bit width. -Value *InstCombiner::reassociateShiftAmtsOfTwoSameDirectionShifts( +Value *InstCombinerImpl::reassociateShiftAmtsOfTwoSameDirectionShifts( BinaryOperator *Sh0, const SimplifyQuery &SQ, bool AnalyzeForSignBitExtraction) { // Look for a shift of some instruction, ignore zext of shift amount if any. @@ -360,7 +361,7 @@ return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2); } -Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { +Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); assert(Op0->getType() == Op1->getType()); @@ -420,8 +421,8 @@ /// Return true if we can simplify two logical (either left or right) shifts /// that have constant shift amounts: OuterShift (InnerShift X, C1), C2. static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl, - Instruction *InnerShift, InstCombiner &IC, - Instruction *CxtI) { + Instruction *InnerShift, + InstCombinerImpl &IC, Instruction *CxtI) { assert(InnerShift->isLogicalShift() && "Unexpected instruction type"); // We need constant scalar or constant splat shifts. @@ -472,7 +473,7 @@ /// where the client will ask if E can be computed shifted right by 64-bits. If /// this succeeds, getShiftedValue() will be called to produce the value. static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift, - InstCombiner &IC, Instruction *CxtI) { + InstCombinerImpl &IC, Instruction *CxtI) { // We can always evaluate constants shifted. if (isa(V)) return true; @@ -608,7 +609,7 @@ /// When canEvaluateShifted() returns true for an expression, this function /// inserts the new computation that produces the shifted value. 
static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, - InstCombiner &IC, const DataLayout &DL) { + InstCombinerImpl &IC, const DataLayout &DL) { // We can always evaluate constants shifted. if (Constant *C = dyn_cast(V)) { if (isLeftShift) @@ -618,7 +619,7 @@ } Instruction *I = cast(V); - IC.Worklist.push(I); + IC.addToWorklist(I); switch (I->getOpcode()) { default: llvm_unreachable("Inconsistency with CanEvaluateShifted"); @@ -672,8 +673,8 @@ } } -Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, - BinaryOperator &I) { +Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1, + BinaryOperator &I) { bool isLeftShift = I.getOpcode() == Instruction::Shl; const APInt *Op1C; @@ -915,7 +916,7 @@ return nullptr; } -Instruction *InstCombiner::visitShl(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) { const SimplifyQuery Q = SQ.getWithInstruction(&I); if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1), @@ -1037,7 +1038,7 @@ return nullptr; } -Instruction *InstCombiner::visitLShr(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1167,7 +1168,7 @@ } Instruction * -InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract( +InstCombinerImpl::foldVariableSignZeroExtensionOfVariableHighBitExtract( BinaryOperator &OldAShr) { assert(OldAShr.getOpcode() == Instruction::AShr && "Must be called with arithmetic right-shift instruction only."); @@ -1235,7 +1236,7 @@ return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType()); } -Instruction *InstCombiner::visitAShr(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) { if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -12,29 +12,18 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "instcombine" -namespace { - -struct AMDGPUImageDMaskIntrinsic { - unsigned Intr; -}; - -#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL -#include "InstCombineTables.inc" - -} // end anonymous namespace - /// Check to see if the specified operand of the specified instruction is a /// constant integer. If so, check to see if there are any bits set in the /// constant that are not demanded. If so, shrink the constant and return true. @@ -63,7 +52,7 @@ /// Inst is an integer instruction that SimplifyDemandedBits knows about. See if /// the instruction has any properties that allow us to simplify its operands. 
-bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { +bool InstCombinerImpl::SimplifyDemandedInstructionBits(Instruction &Inst) { unsigned BitWidth = Inst.getType()->getScalarSizeInBits(); KnownBits Known(BitWidth); APInt DemandedMask(APInt::getAllOnesValue(BitWidth)); @@ -79,22 +68,20 @@ /// This form of SimplifyDemandedBits simplifies the specified instruction /// operand if possible, updating it in place. It returns true if it made any /// change and false otherwise. -bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo, - const APInt &DemandedMask, - KnownBits &Known, - unsigned Depth) { +bool InstCombinerImpl::SimplifyDemandedBits(Instruction *I, unsigned OpNo, + const APInt &DemandedMask, + KnownBits &Known, unsigned Depth) { Use &U = I->getOperandUse(OpNo); Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known, Depth, I); if (!NewVal) return false; if (Instruction* OpInst = dyn_cast(U)) salvageDebugInfo(*OpInst); - + replaceUse(U, NewVal); return true; } - /// This function attempts to replace V with a simpler value based on the /// demanded bits. When this function is called, it is known that only the bits /// set in DemandedMask of the result of V are ever used downstream. @@ -118,9 +105,10 @@ /// operands based on the information about what bits are demanded. This returns /// some other non-null value if it found out that V is equal to another value /// in the context where the specified bits are demanded, but not for all users. -Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, - KnownBits &Known, unsigned Depth, - Instruction *CxtI) { +Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, + KnownBits &Known, + unsigned Depth, + Instruction *CxtI) { assert(V != nullptr && "Null pointer of Value???"); assert(Depth <= 6 && "Limit Search Depth"); uint32_t BitWidth = DemandedMask.getBitWidth(); @@ -728,7 +716,6 @@ bool KnownBitsComputed = false; if (IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { - default: break; case Intrinsic::bswap: { // If the only bits demanded come from one byte of the bswap result, // just shift the input byte into position to eliminate the bswap. @@ -784,39 +771,14 @@ KnownBitsComputed = true; break; } - case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse_movmsk_ps: - case Intrinsic::x86_sse2_movmsk_pd: - case Intrinsic::x86_sse2_pmovmskb_128: - case Intrinsic::x86_avx_movmsk_ps_256: - case Intrinsic::x86_avx_movmsk_pd_256: - case Intrinsic::x86_avx2_pmovmskb: { - // MOVMSK copies the vector elements' sign bits to the low bits - // and zeros the high bits. - unsigned ArgWidth; - if (II->getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { - ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. - } else { - auto Arg = II->getArgOperand(0); - auto ArgType = cast(Arg->getType()); - ArgWidth = ArgType->getNumElements(); - } - - // If we don't need any of low bits then return zero, - // we know that DemandedMask is non-zero already. - APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); - if (DemandedElts.isNullValue()) - return ConstantInt::getNullValue(VTy); - - // We know that the upper bits are set to zero. 
- Known.Zero.setBitsFrom(ArgWidth); - KnownBitsComputed = true; + default: { + // Handle target specific intrinsics + Optional V = targetSimplifyDemandedUseBitsIntrinsic( + *II, DemandedMask, Known, KnownBitsComputed); + if (V.hasValue()) + return V.getValue(); break; } - case Intrinsic::x86_sse42_crc32_64_64: - Known.Zero.setBitsFrom(32); - KnownBitsComputed = true; - break; } } @@ -836,11 +798,9 @@ /// Helper routine of SimplifyDemandedUseBits. It computes Known /// bits. It also tries to handle simplifications that can be done based on /// DemandedMask, but without modifying the Instruction. -Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I, - const APInt &DemandedMask, - KnownBits &Known, - unsigned Depth, - Instruction *CxtI) { +Value *InstCombinerImpl::SimplifyMultipleUseDemandedBits( + Instruction *I, const APInt &DemandedMask, KnownBits &Known, unsigned Depth, + Instruction *CxtI) { unsigned BitWidth = DemandedMask.getBitWidth(); Type *ITy = I->getType(); @@ -940,7 +900,6 @@ return nullptr; } - /// Helper routine of SimplifyDemandedUseBits. It tries to simplify /// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into /// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign @@ -958,11 +917,9 @@ /// /// As with SimplifyDemandedUseBits, it returns NULL if the simplification was /// not successful. -Value * -InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1, - Instruction *Shl, const APInt &ShlOp1, - const APInt &DemandedMask, - KnownBits &Known) { +Value *InstCombinerImpl::simplifyShrShlDemandedBits( + Instruction *Shr, const APInt &ShrOp1, Instruction *Shl, + const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known) { if (!ShlOp1 || !ShrOp1) return nullptr; // No-op. @@ -1022,153 +979,6 @@ return nullptr; } -/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics. -/// -/// Note: This only supports non-TFE/LWE image intrinsic calls; those have -/// struct returns. -Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, - APInt DemandedElts, - int DMaskIdx) { - - // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported. - if (DMaskIdx < 0 && - II->getType()->getScalarSizeInBits() != 32 && - DemandedElts.getActiveBits() == 3) - return nullptr; - - auto *IIVTy = cast(II->getType()); - unsigned VWidth = IIVTy->getNumElements(); - if (VWidth == 1) - return nullptr; - - IRBuilderBase::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(II); - - // Assume the arguments are unchanged and later override them, if needed. - SmallVector Args(II->arg_begin(), II->arg_end()); - - if (DMaskIdx < 0) { - // Buffer case. - - const unsigned ActiveBits = DemandedElts.getActiveBits(); - const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros(); - - // Start assuming the prefix of elements is demanded, but possibly clear - // some other bits if there are trailing zeros (unused components at front) - // and update offset. - DemandedElts = (1 << ActiveBits) - 1; - - if (UnusedComponentsAtFront > 0) { - static const unsigned InvalidOffsetIdx = 0xf; - - unsigned OffsetIdx; - switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_raw_buffer_load: - OffsetIdx = 1; - break; - case Intrinsic::amdgcn_s_buffer_load: - // If resulting type is vec3, there is no point in trimming the - // load with updated offset, as the vec3 would most likely be widened to - // vec4 anyway during lowering. 
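For illustration, the MOVMSK demanded-bits handling removed above now belongs behind the new TTI hook that the added `default:` case dispatches to. A minimal sketch of such a target-side override follows; the class and header names (X86TTIImpl, X86TargetTransformInfo.h) are assumptions, since the target-side files are not part of this excerpt, and only the logic mirrors the code deleted from SimplifyDemandedUseBits:

// Sketch under assumptions: placement in X86TTIImpl is assumed; the body
// restates the MOVMSK handling removed from the generic code above.
#include "X86TargetTransformInfo.h" // assumed target header declaring X86TTIImpl
#include "llvm/ADT/Optional.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;

Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) {
  switch (II.getIntrinsicID()) {
  case Intrinsic::x86_sse2_pmovmskb_128: {
    // MOVMSK copies the vector elements' sign bits to the low bits and zeros
    // the high bits.
    unsigned ArgWidth =
        cast<FixedVectorType>(II.getArgOperand(0)->getType())->getNumElements();
    // If none of the low bits are demanded, the result is known zero.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    if (DemandedElts.isNullValue())
      return ConstantInt::getNullValue(II.getType());
    // Otherwise, record that the bits above the argument width are zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  default:
    break;
  }
  // None means "no target-specific simplification"; the generic caller then
  // falls back to computeKnownBits unless KnownBitsComputed was set above.
  return None;
}

Returning a Value becomes the simplified result exactly as the new default case above consumes it; returning None leaves the generic demanded-bits logic in charge.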
- if (ActiveBits == 4 && UnusedComponentsAtFront == 1) - OffsetIdx = InvalidOffsetIdx; - else - OffsetIdx = 1; - break; - case Intrinsic::amdgcn_struct_buffer_load: - OffsetIdx = 2; - break; - default: - // TODO: handle tbuffer* intrinsics. - OffsetIdx = InvalidOffsetIdx; - break; - } - - if (OffsetIdx != InvalidOffsetIdx) { - // Clear demanded bits and update the offset. - DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1); - auto *Offset = II->getArgOperand(OffsetIdx); - unsigned SingleComponentSizeInBits = - getDataLayout().getTypeSizeInBits(II->getType()->getScalarType()); - unsigned OffsetAdd = - UnusedComponentsAtFront * SingleComponentSizeInBits / 8; - auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd); - Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal); - } - } - } else { - // Image case. - - ConstantInt *DMask = cast(II->getArgOperand(DMaskIdx)); - unsigned DMaskVal = DMask->getZExtValue() & 0xf; - - // Mask off values that are undefined because the dmask doesn't cover them - DemandedElts &= (1 << countPopulation(DMaskVal)) - 1; - - unsigned NewDMaskVal = 0; - unsigned OrigLoadIdx = 0; - for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) { - const unsigned Bit = 1 << SrcIdx; - if (!!(DMaskVal & Bit)) { - if (!!DemandedElts[OrigLoadIdx]) - NewDMaskVal |= Bit; - OrigLoadIdx++; - } - } - - if (DMaskVal != NewDMaskVal) - Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal); - } - - unsigned NewNumElts = DemandedElts.countPopulation(); - if (!NewNumElts) - return UndefValue::get(II->getType()); - - if (NewNumElts >= VWidth && DemandedElts.isMask()) { - if (DMaskIdx >= 0) - II->setArgOperand(DMaskIdx, Args[DMaskIdx]); - return nullptr; - } - - // Validate function argument and return types, extracting overloaded types - // along the way. - SmallVector OverloadTys; - if (!Intrinsic::getIntrinsicSignature(II->getCalledFunction(), OverloadTys)) - return nullptr; - - Module *M = II->getParent()->getParent()->getParent(); - Type *EltTy = IIVTy->getElementType(); - Type *NewTy = - (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts); - - OverloadTys[0] = NewTy; - Function *NewIntrin = - Intrinsic::getDeclaration(M, II->getIntrinsicID(), OverloadTys); - - CallInst *NewCall = Builder.CreateCall(NewIntrin, Args); - NewCall->takeName(II); - NewCall->copyMetadata(*II); - - if (NewNumElts == 1) { - return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall, - DemandedElts.countTrailingZeros()); - } - - SmallVector EltMask; - unsigned NewLoadIdx = 0; - for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) { - if (!!DemandedElts[OrigLoadIdx]) - EltMask.push_back(NewLoadIdx++); - else - EltMask.push_back(NewNumElts); - } - - Value *Shuffle = - Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask); - - return Shuffle; -} - /// The specified value produces a vector with any number of elements. /// This method analyzes which elements of the operand are undef and returns /// that information in UndefElts. @@ -1182,10 +992,11 @@ /// If the information about demanded elements can be used to simplify the /// operation, the operation is simplified, then the resultant value is /// returned. This returns null if no change was made. 
-Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, - APInt &UndefElts, - unsigned Depth, - bool AllowMultipleUsers) { +Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, + APInt DemandedElts, + APInt &UndefElts, + unsigned Depth, + bool AllowMultipleUsers) { // Cannot analyze scalable type. The number of vector elements is not a // compile-time constant. if (isa(V->getType())) @@ -1292,12 +1103,12 @@ }; if (mayIndexStructType(cast(*I))) break; - + // Conservatively track the demanded elements back through any vector // operands we may have. We know there must be at least one, or we // wouldn't have a vector result to get here. Note that we intentionally // merge the undef bits here since gepping with either an undef base or - // index results in undef. + // index results in undef. for (unsigned i = 0; i < I->getNumOperands(); i++) { if (isa(I->getOperand(i))) { // If the entire vector is undefined, just return this info. @@ -1621,227 +1432,19 @@ if (II->getIntrinsicID() == Intrinsic::masked_gather) simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2); simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3); - + // Output elements are undefined if the element from both sources are. // TODO: can strengthen via mask as well. UndefElts = UndefElts2 & UndefElts3; break; } - case Intrinsic::x86_xop_vfrcz_ss: - case Intrinsic::x86_xop_vfrcz_sd: - // The instructions for these intrinsics are speced to zero upper bits not - // pass them through like other scalar intrinsics. So we shouldn't just - // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. - // Instead we should return a zero vector. - if (!DemandedElts[0]) { - Worklist.push(II); - return ConstantAggregateZero::get(II->getType()); - } - - // Only the lower element is used. - DemandedElts = 1; - simplifyAndSetOp(II, 0, DemandedElts, UndefElts); - - // Only the lower element is undefined. The high elements are zero. - UndefElts = UndefElts[0]; - break; - - // Unary scalar-as-vector operations that work column-wise. - case Intrinsic::x86_sse_rcp_ss: - case Intrinsic::x86_sse_rsqrt_ss: - simplifyAndSetOp(II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - Worklist.push(II); - return II->getArgOperand(0); - } - // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions - // checks). - break; - - // Binary scalar-as-vector operations that work column-wise. The high - // elements come from operand 0. The low element is a function of both - // operands. - case Intrinsic::x86_sse_min_ss: - case Intrinsic::x86_sse_max_ss: - case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse2_min_sd: - case Intrinsic::x86_sse2_max_sd: - case Intrinsic::x86_sse2_cmp_sd: { - simplifyAndSetOp(II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - Worklist.push(II); - return II->getArgOperand(0); - } - - // Only lower element is used for operand 1. - DemandedElts = 1; - simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); - - // Lower element is undefined if both lower elements are undefined. - // Consider things like undef&0. The result is known zero, not undef. - if (!UndefElts2[0]) - UndefElts.clearBit(0); - - break; - } - - // Binary scalar-as-vector operations that work column-wise. The high - // elements come from operand 0 and the low element comes from operand 1. 
- case Intrinsic::x86_sse41_round_ss: - case Intrinsic::x86_sse41_round_sd: { - // Don't use the low element of operand 0. - APInt DemandedElts2 = DemandedElts; - DemandedElts2.clearBit(0); - simplifyAndSetOp(II, 0, DemandedElts2, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - Worklist.push(II); - return II->getArgOperand(0); - } - - // Only lower element is used for operand 1. - DemandedElts = 1; - simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); - - // Take the high undef elements from operand 0 and take the lower element - // from operand 1. - UndefElts.clearBit(0); - UndefElts |= UndefElts2[0]; - break; - } - - // Three input scalar-as-vector operations that work column-wise. The high - // elements come from operand 0 and the low element is a function of all - // three inputs. - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_max_ss_round: - case Intrinsic::x86_avx512_mask_min_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - case Intrinsic::x86_avx512_mask_max_sd_round: - case Intrinsic::x86_avx512_mask_min_sd_round: - simplifyAndSetOp(II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - Worklist.push(II); - return II->getArgOperand(0); - } - - // Only lower element is used for operand 1 and 2. - DemandedElts = 1; - simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); - simplifyAndSetOp(II, 2, DemandedElts, UndefElts3); - - // Lower element is undefined if all three lower elements are undefined. - // Consider things like undef&0. The result is known zero, not undef. - if (!UndefElts2[0] || !UndefElts3[0]) - UndefElts.clearBit(0); - - break; - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - case Intrinsic::x86_avx2_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx512_packssdw_512: - case Intrinsic::x86_avx512_packsswb_512: - case Intrinsic::x86_avx512_packusdw_512: - case Intrinsic::x86_avx512_packuswb_512: { - auto *Ty0 = II->getArgOperand(0)->getType(); - unsigned InnerVWidth = cast(Ty0)->getNumElements(); - assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); - - unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; - unsigned VWidthPerLane = VWidth / NumLanes; - unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; - - // Per lane, pack the elements of the first input and then the second. - // e.g. - // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) - // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) - for (int OpNum = 0; OpNum != 2; ++OpNum) { - APInt OpDemandedElts(InnerVWidth, 0); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - unsigned LaneIdx = Lane * VWidthPerLane; - for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { - unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; - if (DemandedElts[Idx]) - OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); - } - } - - // Demand elements from the operand. 
- APInt OpUndefElts(InnerVWidth, 0); - simplifyAndSetOp(II, OpNum, OpDemandedElts, OpUndefElts); - - // Pack the operand's UNDEF elements, one lane at a time. - OpUndefElts = OpUndefElts.zext(VWidth); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); - LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); - LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); - UndefElts |= LaneElts; - } - } - break; - } - - // PSHUFB - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - case Intrinsic::x86_avx512_pshuf_b_512: - // PERMILVAR - case Intrinsic::x86_avx_vpermilvar_ps: - case Intrinsic::x86_avx_vpermilvar_ps_256: - case Intrinsic::x86_avx512_vpermilvar_ps_512: - case Intrinsic::x86_avx_vpermilvar_pd: - case Intrinsic::x86_avx_vpermilvar_pd_256: - case Intrinsic::x86_avx512_vpermilvar_pd_512: - // PERMV - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: { - simplifyAndSetOp(II, 1, DemandedElts, UndefElts); - break; - } - - // SSE4A instructions leave the upper 64-bits of the 128-bit result - // in an undefined state. - case Intrinsic::x86_sse4a_extrq: - case Intrinsic::x86_sse4a_extrqi: - case Intrinsic::x86_sse4a_insertq: - case Intrinsic::x86_sse4a_insertqi: - UndefElts.setHighBits(VWidth / 2); - break; - case Intrinsic::amdgcn_buffer_load: - case Intrinsic::amdgcn_buffer_load_format: - case Intrinsic::amdgcn_raw_buffer_load: - case Intrinsic::amdgcn_raw_buffer_load_format: - case Intrinsic::amdgcn_raw_tbuffer_load: - case Intrinsic::amdgcn_s_buffer_load: - case Intrinsic::amdgcn_struct_buffer_load: - case Intrinsic::amdgcn_struct_buffer_load_format: - case Intrinsic::amdgcn_struct_tbuffer_load: - case Intrinsic::amdgcn_tbuffer_load: - return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts); default: { - if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID())) - return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0); - + // Handle target specific intrinsics + Optional V = targetSimplifyDemandedVectorEltsIntrinsic( + *II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + simplifyAndSetOp); + if (V.hasValue()) + return V.getValue(); break; } } // switch on IntrinsicID diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -35,6 +35,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include #include #include @@ -85,7 +86,8 @@ // If we have a PHI node with a vector type that is only used to feed // itself and be an operand of extractelement at a constant location, // try to replace the PHI of the vector type with a PHI of a scalar type. 
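The same pattern applies to the demanded-vector-elements path: the PSHUFB/VPERMILVAR/PERMV handling deleted above is re-expressed behind the new hook that the added `default:` case calls, using the SimplifyAndSetOp callback that InstCombine passes in. A sketch under the same assumptions (X86TTIImpl placement and header name assumed; the per-intrinsic logic mirrors the removed code):

// Sketch under assumptions: only the control-vector operand (operand 1) of
// these variable shuffles is constrained by the demanded result elements.
#include "X86TargetTransformInfo.h" // assumed target header
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Optional.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <functional>
using namespace llvm;

Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) {
  switch (II.getIntrinsicID()) {
  // PSHUFB / PERMILVAR / PERMV: forward the demanded result elements to the
  // shuffle-control operand via the callback provided by InstCombine.
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx2_permd:
    SimplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  default:
    break;
  }
  // None keeps the generic demanded-elements analysis in charge of the result.
  return None;
}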
-Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { +Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI, + PHINode *PN) { SmallVector Extracts; // The users we want the PHI to have are: // 1) The EI ExtractElement (we already know this) @@ -321,7 +323,7 @@ return UnionUsedElts; } -Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { +Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { Value *SrcVec = EI.getVectorOperand(); Value *Index = EI.getIndexOperand(); if (Value *V = SimplifyExtractElementInst(SrcVec, Index, @@ -531,7 +533,7 @@ /// shufflevector to replace one or more insert/extract pairs. static void replaceExtractElements(InsertElementInst *InsElt, ExtractElementInst *ExtElt, - InstCombiner &IC) { + InstCombinerImpl &IC) { VectorType *InsVecType = InsElt->getType(); VectorType *ExtVecType = ExtElt->getVectorOperandType(); unsigned NumInsElts = InsVecType->getNumElements(); @@ -614,7 +616,7 @@ static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl &Mask, Value *PermittedRHS, - InstCombiner &IC) { + InstCombinerImpl &IC) { assert(V->getType()->isVectorTy() && "Invalid shuffle!"); unsigned NumElts = cast(V->getType())->getNumElements(); @@ -699,7 +701,7 @@ /// first one, making the first one redundant. /// It should be transformed to: /// %0 = insertvalue { i8, i32 } undef, i8 %y, 0 -Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) { +Instruction *InstCombinerImpl::visitInsertValueInst(InsertValueInst &I) { bool IsRedundant = false; ArrayRef FirstIndices = I.getIndices(); @@ -1041,7 +1043,7 @@ return nullptr; } -Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { +Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) { Value *VecOp = IE.getOperand(0); Value *ScalarOp = IE.getOperand(1); Value *IdxOp = IE.getOperand(2); @@ -1525,7 +1527,7 @@ is_contained(Mask, UndefMaskElem) && (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode)); if (MightCreatePoisonOrUB) - NewC = getSafeVectorConstantForBinop(BOpcode, NewC, true); + NewC = InstCombiner::getSafeVectorConstantForBinop(BOpcode, NewC, true); // shuf (bop X, C), X, M --> bop X, C' // shuf X, (bop X, C), M --> bop X, C' @@ -1652,7 +1654,8 @@ is_contained(Mask, UndefMaskElem) && (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc)); if (MightCreatePoisonOrUB) - NewC = getSafeVectorConstantForBinop(BOpc, NewC, ConstantsAreOp1); + NewC = InstCombiner::getSafeVectorConstantForBinop(BOpc, NewC, + ConstantsAreOp1); Value *V; if (X == Y) { @@ -1823,7 +1826,7 @@ /// Try to replace a shuffle with an insertelement or try to replace a shuffle /// operand with the operand of an insertelement. 
static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf, - InstCombiner &IC) { + InstCombinerImpl &IC) { Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1); SmallVector Mask; Shuf.getShuffleMask(Mask); @@ -1974,7 +1977,7 @@ return new ShuffleVectorInst(X, Y, NewMask); } -Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { +Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -59,6 +59,7 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/BasicBlock.h" @@ -160,7 +161,41 @@ static cl::opt ShouldLowerDbgDeclare("instcombine-lower-dbg-declare", cl::Hidden, cl::init(true)); -Value *InstCombiner::EmitGEPOffset(User *GEP) { +Optional +InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) { + // Handle target specific intrinsics + if (II.getCalledFunction()->isTargetIntrinsic()) { + return TTI.instCombineIntrinsic(*this, II); + } + return None; +} + +Optional InstCombiner::targetSimplifyDemandedUseBitsIntrinsic( + IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) { + // Handle target specific intrinsics + if (II.getCalledFunction()->isTargetIntrinsic()) { + return TTI.simplifyDemandedUseBitsIntrinsic(*this, II, DemandedMask, Known, + KnownBitsComputed); + } + return None; +} + +Optional InstCombiner::targetSimplifyDemandedVectorEltsIntrinsic( + IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, + APInt &UndefElts3, + std::function + SimplifyAndSetOp) { + // Handle target specific intrinsics + if (II.getCalledFunction()->isTargetIntrinsic()) { + return TTI.simplifyDemandedVectorEltsIntrinsic( + *this, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + SimplifyAndSetOp); + } + return None; +} + +Value *InstCombinerImpl::EmitGEPOffset(User *GEP) { return llvm::EmitGEPOffset(&Builder, DL, GEP); } @@ -173,8 +208,8 @@ /// legal to convert to, in order to open up more combining opportunities. /// NOTE: this treats i8, i16 and i32 specially, due to them being so common /// from frontend languages. -bool InstCombiner::shouldChangeType(unsigned FromWidth, - unsigned ToWidth) const { +bool InstCombinerImpl::shouldChangeType(unsigned FromWidth, + unsigned ToWidth) const { bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth); bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth); @@ -201,7 +236,7 @@ /// to a larger illegal type. i1 is always treated as a legal type because it is /// a fundamental type in IR, and there are many specialized optimizations for /// i1 types. -bool InstCombiner::shouldChangeType(Type *From, Type *To) const { +bool InstCombinerImpl::shouldChangeType(Type *From, Type *To) const { // TODO: This could be extended to allow vectors. Datalayout changes might be // needed to properly support that. 
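The wrappers added above only consult TTI when the callee is a target intrinsic. For illustration, a minimal sketch of what a backend override of the instCombineIntrinsic hook can look like; the target name and intrinsic are hypothetical, and replaceInstUsesWith is assumed to be exposed on the public InstCombiner base, since target combines need it:

// Sketch under assumptions: MyTargetTTIImpl and Intrinsic::mytarget_passthru
// are placeholders; only the hook's shape and return conventions come from
// the interface added by this patch.
#include "llvm/ADT/Optional.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;

Optional<Instruction *>
MyTargetTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) {
  switch (II.getIntrinsicID()) {
  case Intrinsic::mytarget_passthru: // hypothetical identity-like intrinsic
    // Replace all uses of the call with its operand; the returned
    // Instruction* tells InstCombine the call has been combined.
    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  default:
    break;
  }
  // None: nothing target specific to do; generic InstCombine continues.
  return None;
}

Because these combines are now reached through TTI, the InstCombine regression tests below add -mtriple to their RUN lines: without a target triple, opt builds a default TTI that offers no target-specific folds.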
if (!From->isIntegerTy() || !To->isIntegerTy()) @@ -269,7 +304,8 @@ /// cast to eliminate one of the associative operations: /// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2))) /// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2)) -static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1, InstCombiner &IC) { +static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1, + InstCombinerImpl &IC) { auto *Cast = dyn_cast(BinOp1->getOperand(0)); if (!Cast || !Cast->hasOneUse()) return false; @@ -327,7 +363,7 @@ /// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. /// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" /// if C1 and C2 are constants. -bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { +bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Instruction::BinaryOps Opcode = I.getOpcode(); bool Changed = false; @@ -555,9 +591,10 @@ /// This tries to simplify binary operations by factorizing out common terms /// (e. g. "(A*B)+(A*C)" -> "A*(B+C)"). -Value *InstCombiner::tryFactorization(BinaryOperator &I, - Instruction::BinaryOps InnerOpcode, - Value *A, Value *B, Value *C, Value *D) { +Value *InstCombinerImpl::tryFactorization(BinaryOperator &I, + Instruction::BinaryOps InnerOpcode, + Value *A, Value *B, Value *C, + Value *D) { assert(A && B && C && D && "All values must be provided"); Value *V = nullptr; @@ -660,7 +697,7 @@ /// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in /// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win). /// Returns the simplified value, or null if it didn't simplify. -Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { +Value *InstCombinerImpl::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); BinaryOperator *Op0 = dyn_cast(LHS); BinaryOperator *Op1 = dyn_cast(RHS); @@ -774,8 +811,9 @@ return SimplifySelectsFeedingBinaryOp(I, LHS, RHS); } -Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, - Value *LHS, Value *RHS) { +Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, + Value *LHS, + Value *RHS) { Value *A, *B, *C, *D, *E, *F; bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C))); bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F))); @@ -827,7 +865,7 @@ /// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a /// constant zero (which is the 'negate' form). -Value *InstCombiner::dyn_castNegVal(Value *V) const { +Value *InstCombinerImpl::dyn_castNegVal(Value *V) const { Value *NegV; if (match(V, m_Neg(m_Value(NegV)))) return NegV; @@ -888,7 +926,8 @@ return RI; } -Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { +Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, + SelectInst *SI) { // Don't modify shared select instructions. 
if (!SI->hasOneUse()) return nullptr; @@ -983,7 +1022,7 @@ return RI; } -Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) { +Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { unsigned NumPHIValues = PN->getNumIncomingValues(); if (NumPHIValues == 0) return nullptr; @@ -1125,7 +1164,7 @@ return replaceInstUsesWith(I, NewPN); } -Instruction *InstCombiner::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { +Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { if (!isa(I.getOperand(1))) return nullptr; @@ -1143,8 +1182,9 @@ /// is a sequence of GEP indices into the pointed type that will land us at the /// specified offset. If so, fill them into NewIndices and return the resultant /// element type, otherwise return null. -Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset, - SmallVectorImpl &NewIndices) { +Type * +InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t Offset, + SmallVectorImpl &NewIndices) { Type *Ty = PtrTy->getElementType(); if (!Ty->isSized()) return nullptr; @@ -1213,7 +1253,7 @@ /// Return a value X such that Val = X * Scale, or null if none. /// If the multiplication is known not to overflow, then NoSignedWrap is set. -Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { +Value *InstCombinerImpl::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { assert(isa(Val->getType()) && "Can only descale integers!"); assert(cast(Val->getType())->getBitWidth() == Scale.getBitWidth() && "Scale not compatible with value!"); @@ -1453,7 +1493,7 @@ } while (true); } -Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) { +Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { // FIXME: some of this is likely fine for scalable vectors if (!isa(Inst.getType())) return nullptr; @@ -1675,7 +1715,7 @@ /// Try to narrow the width of a binop if at least 1 operand is an extend of /// of a value. This requires a potentially expensive known bits check to make /// sure the narrow op does not overflow. -Instruction *InstCombiner::narrowMathIfNoOverflow(BinaryOperator &BO) { +Instruction *InstCombinerImpl::narrowMathIfNoOverflow(BinaryOperator &BO) { // We need at least one extended operand. Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1); @@ -1764,7 +1804,7 @@ return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel); } -Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { +Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector Ops(GEP.op_begin(), GEP.op_end()); Type *GEPType = GEP.getType(); Type *GEPEltType = GEP.getSourceElementType(); @@ -2516,7 +2556,7 @@ return true; } -Instruction *InstCombiner::visitAllocSite(Instruction &MI) { +Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { // If we have a malloc call which is only used in any amount of comparisons to // null and free calls, delete the calls and replace the comparisons with true // or false as appropriate. @@ -2675,7 +2715,7 @@ return &FI; } -Instruction *InstCombiner::visitFree(CallInst &FI) { +Instruction *InstCombinerImpl::visitFree(CallInst &FI) { Value *Op = FI.getArgOperand(0); // free undef -> unreachable. 
@@ -2716,7 +2756,7 @@ return false; } -Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { +Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) { if (RI.getNumOperands() == 0) // ret void return nullptr; @@ -2739,7 +2779,7 @@ return nullptr; } -Instruction *InstCombiner::visitUnconditionalBranchInst(BranchInst &BI) { +Instruction *InstCombinerImpl::visitUnconditionalBranchInst(BranchInst &BI) { assert(BI.isUnconditional() && "Only for unconditional branches."); // If this store is the second-to-last instruction in the basic block @@ -2768,7 +2808,7 @@ return nullptr; } -Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { +Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) { if (BI.isUnconditional()) return visitUnconditionalBranchInst(BI); @@ -2804,7 +2844,7 @@ return nullptr; } -Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { +Instruction *InstCombinerImpl::visitSwitchInst(SwitchInst &SI) { Value *Cond = SI.getCondition(); Value *Op0; ConstantInt *AddRHS; @@ -2835,7 +2875,7 @@ unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes); // Shrink the condition operand if the new type is smaller than the old type. - // But do not shrink to a non-standard type, because backend can't generate + // But do not shrink to a non-standard type, because backend can't generate // good code for that yet. // TODO: We can make it aggressive again after fixing PR39569. if (NewWidth > 0 && NewWidth < Known.getBitWidth() && @@ -2854,7 +2894,7 @@ return nullptr; } -Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { +Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) { Value *Agg = EV.getAggregateOperand(); if (!EV.hasIndices()) @@ -3015,7 +3055,7 @@ cast(RHS->getType())->getNumElements(); } -Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { +Instruction *InstCombinerImpl::visitLandingPadInst(LandingPadInst &LI) { // The logic here should be correct for any real-world personality function. // However if that turns out not to be true, the offending logic can always // be conditioned on the personality function, like the catch-all logic is. @@ -3324,7 +3364,7 @@ return nullptr; } -Instruction *InstCombiner::visitFreeze(FreezeInst &I) { +Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { Value *Op0 = I.getOperand(0); if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I))) @@ -3435,7 +3475,7 @@ return true; } -bool InstCombiner::run() { +bool InstCombinerImpl::run() { while (!Worklist.isEmpty()) { // Walk deferred instructions in reverse order, and push them to the // worklist, which means they'll end up popped from the worklist in-order. 
@@ -3718,8 +3758,8 @@ static bool combineInstructionsOverFunction( Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, - AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, - OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI, + DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) { auto &DL = F.getParent()->getDataLayout(); MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue()); @@ -3763,8 +3803,8 @@ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); - InstCombiner IC(Worklist, Builder, F.hasMinSize(), AA, - AC, TLI, DT, ORE, BFI, PSI, DL, LI); + InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT, + ORE, BFI, PSI, DL, LI); IC.MaxArraySizeForCombine = MaxArraySize; if (!IC.run()) @@ -3787,6 +3827,7 @@ auto &DT = AM.getResult(F); auto &TLI = AM.getResult(F); auto &ORE = AM.getResult(F); + auto &TTI = AM.getResult(F); auto *LI = AM.getCachedResult(F); @@ -3797,8 +3838,8 @@ auto *BFI = (PSI && PSI->hasProfileSummary()) ? &AM.getResult(F) : nullptr; - if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, - PSI, MaxIterations, LI)) + if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE, + BFI, PSI, MaxIterations, LI)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -3816,6 +3857,7 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -3834,6 +3876,7 @@ auto AA = &getAnalysis().getAAResults(); auto &AC = getAnalysis().getAssumptionCache(F); auto &TLI = getAnalysis().getTLI(F); + auto &TTI = getAnalysis().getTTI(F); auto &DT = getAnalysis().getDomTree(); auto &ORE = getAnalysis().getORE(); @@ -3847,8 +3890,8 @@ &getAnalysis().getBFI() : nullptr; - return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, - PSI, MaxIterations, LI); + return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE, + BFI, PSI, MaxIterations, LI); } char InstructionCombiningPass::ID = 0; @@ -3867,6 +3910,7 @@ "Combine redundant instructions", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s +; RUN: opt -instcombine -mtriple=thumbv8.1m.main %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s declare <16 x i1> @llvm.arm.mve.vctp8(i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -instcombine -S %s | FileCheck --check-prefix=IR %s -; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -O3 -o - | FileCheck --check-prefix=ASM %s +; RUN: opt -instcombine -mtriple=thumbv8.1m.main -S %s | FileCheck --check-prefix=IR %s +; RUN: opt -instcombine -mtriple=thumbv8.1m.main %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -O3 -o - | FileCheck --check-prefix=ASM %s %struct.foo = type { [2 x <4 x i32>] } diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll b/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll --- a/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs -o - | FileCheck %s +; RUN: opt -instcombine -mtriple=thumbv8.1m.main-none-eabi %s | llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs -o - | FileCheck %s target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" @@ -42,4 +42,3 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) declare <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) declare <8 x i1> @llvm.arm.mve.vctp16(i32) - diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -instcombine %s | FileCheck %s +; RUN: opt -S -instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s ; -------------------------------------------------------------------- ; llvm.amdgcn.buffer.load diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -instcombine -S < %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -instcombine -S < %s | FileCheck %s ; -------------------------------------------------------------------- ; llvm.amdgcn.rcp diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ldexp.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ldexp.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/ldexp.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/ldexp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -instcombine -S | FileCheck %s define float @ldexp_f32_undef_undef() { ; CHECK-LABEL: @ldexp_f32_undef_undef( diff --git a/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll b/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll --- a/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll +++ b/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -instcombine -S -o - %s | FileCheck %s +; RUN: 
opt -instcombine -S -mtriple=arm -o - %s | FileCheck %s target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll --- a/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt < %s -instcombine -mtriple=arm -S | FileCheck %s ; The alignment arguments for NEON load/store intrinsics can be increased ; by instcombine. Check for this. diff --git a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll --- a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll +++ b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll @@ -6,11 +6,11 @@ ; RUN: cat %s > %t.ftz ; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "preserve-sign" }' >> %t.ftz -; RUN: opt < %t.ftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ +; RUN: opt < %t.ftz -instcombine -mtriple=nvptx64-nvidia-cuda -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ ; RUN: cat %s > %t.noftz ; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "ieee" }' >> %t.noftz -; RUN: opt < %t.noftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ +; RUN: opt < %t.noftz -instcombine -mtriple=nvptx64-nvidia-cuda -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ ; We handle nvvm intrinsics with ftz variants as follows: ; - If the module is in ftz mode, the ftz variant is transformed into the diff --git a/llvm/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll b/llvm/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll --- a/llvm/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll +++ b/llvm/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s ; The test checks the folding of cmp(sub(a,b),0) into cmp(a,b). 
diff --git a/llvm/test/Transforms/InstCombine/X86/addcarry.ll b/llvm/test/Transforms/InstCombine/X86/addcarry.ll
--- a/llvm/test/Transforms/InstCombine/X86/addcarry.ll
+++ b/llvm/test/Transforms/InstCombine/X86/addcarry.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
 declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64)
@@ -35,4 +35,3 @@
   %r = extractvalue { i8, i64 } %s, 1
   ret i64 %r
 }
-
diff --git a/llvm/test/Transforms/InstCombine/X86/clmulqdq.ll b/llvm/test/Transforms/InstCombine/X86/clmulqdq.ll
--- a/llvm/test/Transforms/InstCombine/X86/clmulqdq.ll
+++ b/llvm/test/Transforms/InstCombine/X86/clmulqdq.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8)
 declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8)
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx2.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx2.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx2.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Verify that instcombine is able to fold identity shuffles.
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll b/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 declare i32 @llvm.x86.tbm.bextri.u32(i32, i32) nounwind readnone
 declare i64 @llvm.x86.tbm.bextri.u64(i64, i64) nounwind readnone
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-insertps.ll b/llvm/test/Transforms/InstCombine/X86/x86-insertps.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-insertps.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-insertps.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll b/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 ;; MASKED LOADS
@@ -325,4 +325,3 @@
 declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>)
 
 declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*)
-
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-pack.ll b/llvm/test/Transforms/InstCombine/X86/x86-pack.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-pack.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-pack.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 ;
 ; UNDEF Elts
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-pshufb.ll b/llvm/test/Transforms/InstCombine/X86/x86-pshufb.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-pshufb.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-pshufb.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 ; Verify that instcombine is able to fold identity shuffles.
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define float @test_rcp_ss_0(float %a) {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define double @test_sqrt_sd_0(double %a) {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define <2 x double> @test_round_sd(<2 x double> %a, <2 x double> %b) {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse4a.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse4a.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse4a.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse4a.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 ;
 ; EXTRQ
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define i16 @test1(float %f) {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ;
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll b/llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Verify that instcombine is able to fold identity shuffles.
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-xop.ll b/llvm/test/Transforms/InstCombine/X86/x86-xop.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-xop.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-xop.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 define <2 x double> @test_vfrcz_sd(<2 x double> %a) {
 ; CHECK-LABEL: @test_vfrcz_sd(