Index: include/llvm/CodeGen/MachineRegisterInfo.h
===================================================================
--- include/llvm/CodeGen/MachineRegisterInfo.h
+++ include/llvm/CodeGen/MachineRegisterInfo.h
@@ -84,14 +84,15 @@
   /// all registers that were disabled are removed from the list.
   SmallVector<MCPhysReg, 16> UpdatedCSRs;
 
-  /// RegAllocHints - This vector records register allocation hints for virtual
-  /// registers. For each virtual register, it keeps a register and hint type
-  /// pair making up the allocation hint. Hint type is target specific except
-  /// for the value 0 which means the second value of the pair is the preferred
-  /// register for allocation. For example, if the hint is <0, 1024>, it means
-  /// the allocator should prefer the physical register allocated to the virtual
-  /// register of the hint.
-  IndexedMap<std::pair<unsigned, unsigned>, VirtReg2IndexFunctor> RegAllocHints;
+  /// RegAllocHints - This vector records register allocation hints for
+  /// virtual registers. For each virtual register, it keeps a pair of a
+  /// hint type and a vector of hinted registers, which together make up
+  /// the allocation hints. Only the first hint may be target specific, in
+  /// which case the first member of the pair is non-zero. If a hinted
+  /// register is virtual, the allocator should prefer the physical
+  /// register allocated to it, if any.
+  IndexedMap<std::pair<unsigned, SmallVector<unsigned, 4>>,
+             VirtReg2IndexFunctor> RegAllocHints;
 
   /// PhysRegUseDefLists - This is an array of the head of the use/def list for
   /// physical registers.
@@ -702,35 +703,61 @@
   void clearVirtRegs();
 
   /// setRegAllocationHint - Specify a register allocation hint for the
-  /// specified virtual register.
+  /// specified virtual register. This is typically used by targets, and any
+  /// earlier hint is overwritten.
   void setRegAllocationHint(unsigned VReg, unsigned Type, unsigned PrefReg) {
     assert(TargetRegisterInfo::isVirtualRegister(VReg));
     RegAllocHints[VReg].first  = Type;
-    RegAllocHints[VReg].second = PrefReg;
+    RegAllocHints[VReg].second.clear();
+    RegAllocHints[VReg].second.push_back(PrefReg);
   }
 
-  /// Specify the preferred register allocation hint for the specified virtual
-  /// register.
+  /// addRegAllocationHint - Add a register allocation hint to the hints
+  /// vector for VReg.
+  void addRegAllocationHint(unsigned VReg, unsigned PrefReg) {
+    assert(TargetRegisterInfo::isVirtualRegister(VReg));
+    RegAllocHints[VReg].second.push_back(PrefReg);
+  }
+
+  /// Specify the preferred (target independent) register allocation hint for
+  /// the specified virtual register.
   void setSimpleHint(unsigned VReg, unsigned PrefReg) {
     setRegAllocationHint(VReg, /*Type=*/0, PrefReg);
   }
 
+  /// Clear any previous register allocation hints for VReg.
+  void clearRegAllocationHints(unsigned VReg) {
+    RegAllocHints[VReg].first = 0;
+    RegAllocHints[VReg].second.clear();
+  }
+
   /// getRegAllocationHint - Return the register allocation hint for the
-  /// specified virtual register.
+  /// specified virtual register. If there are several hints, this returns
+  /// the one with the greatest weight.
   std::pair<unsigned, unsigned>
   getRegAllocationHint(unsigned VReg) const {
     assert(TargetRegisterInfo::isVirtualRegister(VReg));
-    return RegAllocHints[VReg];
+    unsigned BestHint = (RegAllocHints[VReg].second.size() ?
+                         RegAllocHints[VReg].second[0] : 0);
+    return std::pair<unsigned, unsigned>(RegAllocHints[VReg].first, BestHint);
   }
 
-  /// getSimpleHint - Return the preferred register allocation hint, or 0 if a
-  /// standard simple hint (Type == 0) is not set.
+  /// getSimpleHint - Same as getRegAllocationHint except it will only return
+  /// a target independent hint.
   unsigned getSimpleHint(unsigned VReg) const {
     assert(TargetRegisterInfo::isVirtualRegister(VReg));
    std::pair<unsigned, unsigned> Hint = getRegAllocationHint(VReg);
    return Hint.first ? 0 : Hint.second;
  }
 
+  /// getRegAllocationHints - Return a reference to the pair of a hint type
+  /// and the vector of all register allocation hints for VReg.
+  const std::pair<unsigned, SmallVector<unsigned, 4>>
+  &getRegAllocationHints(unsigned VReg) const {
+    assert(TargetRegisterInfo::isVirtualRegister(VReg));
+    return RegAllocHints[VReg];
+  }
+
   /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
   /// specified register as undefined which causes the DBG_VALUE to be
   /// deleted during LiveDebugVariables analysis.
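For reference, a minimal sketch of how the extended API is meant to be used
by a pass. Only the MachineRegisterInfo calls come from this patch; the
helper name and all register/type values are made-up placeholders:

  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include <utility>
  using namespace llvm;

  // Record one target-specific hint followed by two generic copy hints
  // for VReg (all arguments are hypothetical).
  static void recordHints(MachineRegisterInfo &MRI, unsigned VReg,
                          unsigned TargetType, unsigned TargetReg,
                          unsigned CopyReg0, unsigned CopyReg1) {
    // Overwrites any earlier hints; a non-zero Type marks the first
    // vector entry as target specific.
    MRI.setRegAllocationHint(VReg, TargetType, TargetReg);
    // Generic copy hints are appended after it, in priority order.
    MRI.addRegAllocationHint(VReg, CopyReg0);
    MRI.addRegAllocationHint(VReg, CopyReg1);
    // The single-hint query still works and returns the first entry.
    std::pair<unsigned, unsigned> Best = MRI.getRegAllocationHint(VReg);
    (void)Best; // Best == (TargetType, TargetReg) here.
  }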
Index: include/llvm/Target/TargetRegisterInfo.h
===================================================================
--- include/llvm/Target/TargetRegisterInfo.h
+++ include/llvm/Target/TargetRegisterInfo.h
@@ -784,11 +784,10 @@
   /// as returned from RegisterClassInfo::getOrder(). The hint registers must
   /// come from Order, and they must not be reserved.
   ///
-  /// The default implementation of this function can resolve
-  /// target-independent hints provided to MRI::setRegAllocationHint with
-  /// HintType == 0. Targets that override this function should defer to the
-  /// default implementation if they have no reason to change the allocation
-  /// order for VirtReg. There may be target-independent hints.
+  /// The default implementation of this function will only add
+  /// target-independent register allocation hints. Targets that override
+  /// this function should typically call this default implementation as
+  /// well and expect to see generic copy hints added.
   virtual void getRegAllocationHints(unsigned VirtReg,
                                      ArrayRef<MCPhysReg> Order,
                                      SmallVectorImpl<MCPhysReg> &Hints,
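A pattern sketch of the override style this comment describes; it is not
from the patch and assumes an existing out-of-tree backend class
(MyTargetRegisterInfo, wantsR0Hint() and MyTarget::R0 are hypothetical):

  // Push the target's own hints first, so they take priority, then let
  // the default implementation append the generic copy hints behind them.
  void MyTargetRegisterInfo::getRegAllocationHints(
      unsigned VirtReg, ArrayRef<MCPhysReg> Order,
      SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
      const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
    if (wantsR0Hint(VirtReg, MF))   // Hypothetical target heuristic.
      Hints.push_back(MyTarget::R0); // Hypothetical register.
    TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                              VRM, Matrix);
  }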
Index: lib/CodeGen/CalcSpillWeights.cpp
===================================================================
--- lib/CodeGen/CalcSpillWeights.cpp
+++ lib/CodeGen/CalcSpillWeights.cpp
@@ -69,14 +69,16 @@
   if (TargetRegisterInfo::isVirtualRegister(hreg))
     return sub == hsub ? hreg : 0;
 
+  unsigned CopiedPReg = (hsub ? tri.getSubReg(hreg, hsub) : hreg);
   const TargetRegisterClass *rc = mri.getRegClass(reg);
+  if (rc->contains(CopiedPReg))
+    return CopiedPReg;
 
-  // Only allow physreg hints in rc.
-  if (sub == 0)
-    return rc->contains(hreg) ? hreg : 0;
+  // Check if reg:sub matches so that a super register could be hinted.
+  if (sub)
+    return tri.getMatchingSuperReg(CopiedPReg, sub, rc);
 
-  // reg:sub should match the physreg hreg.
-  return tri.getMatchingSuperReg(hreg, sub, rc);
+  return 0;
 }
 
 // Check if all values in LI are rematerializable
@@ -144,16 +146,27 @@
   unsigned numInstr = 0; // Number of instructions using li
   SmallPtrSet<MachineInstr*, 8> visited;
 
-  // Find the best physreg hint and the best virtreg hint.
-  float bestPhys = 0, bestVirt = 0;
-  unsigned hintPhys = 0, hintVirt = 0;
-
-  // Don't recompute a target specific hint.
-  bool noHint = mri.getRegAllocationHint(li.reg).first != 0;
-
   // Don't recompute spill weight for an unspillable register.
   bool Spillable = li.isSpillable();
 
+  // CopyHint is a sortable hint derived from a COPY instruction.
+  struct CopyHint {
+    unsigned Reg;
+    float Weight;
+    bool IsPhys;
+    CopyHint(unsigned R, float W, bool P) : Reg(R), Weight(W), IsPhys(P) {}
+    bool operator<(const CopyHint &rhs) const {
+      // Always prefer any physreg hint.
+      if (IsPhys != rhs.IsPhys)
+        return (IsPhys && !rhs.IsPhys);
+      if (Weight != rhs.Weight)
+        return (Weight > rhs.Weight);
+      // (just for the purpose of maintaining the set)
+      return Reg < rhs.Reg;
+    }
+  };
+
+  std::set<CopyHint> CopyHints;
+
   for (MachineRegisterInfo::reg_instr_iterator
        I = mri.reg_instr_begin(li.reg), E = mri.reg_instr_end();
        I != E; ) {
@@ -186,7 +199,7 @@
     }
 
     // Get allocation hints from copies.
-    if (noHint || !mi->isCopy())
+    if (!mi->isCopy())
       continue;
     unsigned hint = copyHint(mi, li.reg, tri, mri);
     if (!hint)
@@ -196,27 +209,28 @@
     //
     // FIXME: we probably shouldn't use floats at all.
     volatile float hweight = Hint[hint] += weight;
-    if (TargetRegisterInfo::isPhysicalRegister(hint)) {
-      if (hweight > bestPhys && mri.isAllocatable(hint)) {
-        bestPhys = hweight;
-        hintPhys = hint;
-      }
-    } else {
-      if (hweight > bestVirt) {
-        bestVirt = hweight;
-        hintVirt = hint;
-      }
-    }
+    CopyHints.insert(CopyHint(hint, hweight, tri.isPhysicalRegister(hint)));
   }
 
   Hint.clear();
 
-  // Always prefer the physreg hint.
-  if (unsigned hint = hintPhys ? hintPhys : hintVirt) {
-    mri.setRegAllocationHint(li.reg, 0, hint);
+  std::pair<unsigned, unsigned> TargetHint = mri.getRegAllocationHint(li.reg);
+  if (TargetHint.first == 0 && TargetHint.second)
+    // Forget any previous generic hints, as they are now recomputed.
+    mri.clearRegAllocationHints(li.reg);
+
+  // Pass all the sorted copy hints to mri.
+  for (auto &Hint : CopyHints) {
+    if (TargetHint.first != 0 && Hint.Reg == TargetHint.second)
+      // Don't add a register already hinted with target type. It will be
+      // added later with a higher priority than these copy hints.
+      continue;
+    mri.addRegAllocationHint(li.reg, Hint.Reg);
+  }
+
+  if (CopyHints.size())
     // Weakly boost the spill weight of hinted registers.
     totalWeight *= 1.01F;
-  }
 
   // If the live interval was already unspillable, leave it that way.
   if (!Spillable)
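The std::set ordering produced by the CopyHint comparator above decides in
which order the copy hints reach MRI: physreg hints first, then descending
weight, with the register number only as a tie-breaker. A standalone
illustration (not LLVM code; all register numbers are arbitrary):

  #include <cassert>
  #include <set>

  struct CopyHint {
    unsigned Reg;
    float Weight;
    bool IsPhys;
    CopyHint(unsigned R, float W, bool P) : Reg(R), Weight(W), IsPhys(P) {}
    bool operator<(const CopyHint &RHS) const {
      if (IsPhys != RHS.IsPhys)
        return IsPhys && !RHS.IsPhys; // Physreg hints sort first.
      if (Weight != RHS.Weight)
        return Weight > RHS.Weight;   // Then heavier weight first.
      return Reg < RHS.Reg;           // Tie-breaker to keep entries distinct.
    }
  };

  int main() {
    std::set<CopyHint> Hints;
    Hints.insert(CopyHint(/*Reg=*/5001, /*Weight=*/4.0f, /*IsPhys=*/false));
    Hints.insert(CopyHint(/*Reg=*/3,    /*Weight=*/1.0f, /*IsPhys=*/true));
    Hints.insert(CopyHint(/*Reg=*/7,    /*Weight=*/2.0f, /*IsPhys=*/true));
    // Iteration order: physreg 7 (weight 2), physreg 3 (weight 1),
    // then virtreg 5001 despite its larger weight.
    assert(Hints.begin()->Reg == 7);
    return 0;
  }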
Index: lib/CodeGen/TargetRegisterInfo.cpp
===================================================================
--- lib/CodeGen/TargetRegisterInfo.cpp
+++ lib/CodeGen/TargetRegisterInfo.cpp
@@ -368,31 +368,36 @@
                            const VirtRegMap *VRM,
                            const LiveRegMatrix *Matrix) const {
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
-
-  // Hints with HintType != 0 were set by target-dependent code.
-  // Such targets must provide their own implementation of
-  // TRI::getRegAllocationHints to interpret those hint types.
-  assert(Hint.first == 0 && "Target must implement TRI::getRegAllocationHints");
-
-  // Target-independent hints are either a physical or a virtual register.
-  unsigned Phys = Hint.second;
-  if (VRM && isVirtualRegister(Phys))
-    Phys = VRM->getPhys(Phys);
-
-  // Check that Phys is a valid hint in VirtReg's register class.
-  if (!isPhysicalRegister(Phys))
-    return;
-  if (MRI.isReserved(Phys))
-    return;
-  // Check that Phys is in the allocation order. We shouldn't heed hints
-  // from VirtReg's register class if they aren't in the allocation order. The
-  // target probably has a reason for removing the register.
-  if (!is_contained(Order, Phys))
-    return;
-
-  // All clear, tell the register allocator to prefer this register.
-  Hints.push_back(Phys);
+  const std::pair<unsigned, SmallVector<unsigned, 4>> &Hints_MRI =
+    MRI.getRegAllocationHints(VirtReg);
+
+  // First hint may be a target hint.
+  bool Skip = (Hints_MRI.first != 0);
+  for (auto Reg : Hints_MRI.second) {
+    if (Skip) {
+      Skip = false;
+      continue;
+    }
+
+    // Target-independent hints are either a physical or a virtual register.
+    unsigned Phys = Reg;
+    if (VRM && isVirtualRegister(Phys))
+      Phys = VRM->getPhys(Phys);
+
+    // Check that Phys is a valid hint in VirtReg's register class.
+    if (!isPhysicalRegister(Phys))
+      continue;
+    if (MRI.isReserved(Phys))
+      continue;
+    // Check that Phys is in the allocation order. We shouldn't heed hints
+    // from VirtReg's register class if they aren't in the allocation order.
+    // The target probably has a reason for removing the register.
+    if (!is_contained(Order, Phys))
+      continue;
+
+    // All clear, tell the register allocator to prefer this register.
+    Hints.push_back(Phys);
+  }
 }
 
 bool TargetRegisterInfo::canRealignStack(const MachineFunction &MF) const {
Index: test/CodeGen/AArch64/arm64-aapcs.ll
===================================================================
--- test/CodeGen/AArch64/arm64-aapcs.ll
+++ test/CodeGen/AArch64/arm64-aapcs.ll
@@ -5,20 +5,20 @@
 ; CHECK-LABEL: @test_i128_align
 define i128 @test_i128_align(i32, i128 %arg, i32 %after) {
   store i32 %after, i32* @var, align 4
-; CHECK: str w4, [{{x[0-9]+}}, :lo12:var]
+; CHECK-DAG: str w4, [{{x[0-9]+}}, :lo12:var]
 
   ret i128 %arg
-; CHECK: mov x0, x2
-; CHECK: mov x1, x3
+; CHECK-DAG: mov x0, x2
+; CHECK-DAG: mov x1, x3
 }
 
 ; CHECK-LABEL: @test_i64x2_align
 define [2 x i64] @test_i64x2_align(i32, [2 x i64] %arg, i32 %after) {
   store i32 %after, i32* @var, align 4
-; CHECK: str w3, [{{x[0-9]+}}, :lo12:var]
+; CHECK-DAG: str w3, [{{x[0-9]+}}, :lo12:var]
 
   ret [2 x i64] %arg
-; CHECK: mov x0, x1
+; CHECK-DAG: mov x0, x1
 ; CHECK: mov x1, x2
 }
Index: test/CodeGen/AArch64/func-argpassing.ll
===================================================================
--- test/CodeGen/AArch64/func-argpassing.ll
+++ test/CodeGen/AArch64/func-argpassing.ll
@@ -164,11 +164,11 @@
 define i64 @check_i128_regalign(i32 %val0, i128 %val1, i64 %val2) {
 ; CHECK-LABEL: check_i128_regalign
     store i128 %val1, i128* @var128
-; CHECK: add x[[VAR128:[0-9]+]], {{x[0-9]+}}, :lo12:var128
+; CHECK-DAG: add x[[VAR128:[0-9]+]], {{x[0-9]+}}, :lo12:var128
 ; CHECK-DAG: stp x2, x3, [x[[VAR128]]]
 
     ret i64 %val2
-; CHECK: mov x0, x4
+; CHECK-DAG: mov x0, x4
 }
 
 define void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3,
Index: test/CodeGen/AArch64/swifterror.ll
===================================================================
--- test/CodeGen/AArch64/swifterror.ll
+++ test/CodeGen/AArch64/swifterror.ll
@@ -40,11 +40,11 @@
 ; CHECK-APPLE: mov [[ID:x[0-9]+]], x0
 ; CHECK-APPLE: mov x21, xzr
 ; CHECK-APPLE: bl {{.*}}foo
-; CHECK-APPLE: cbnz x21
+; CHECK-APPLE: mov x0, x21
+; CHECK-APPLE: cbnz x0
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8]
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x0, #8]
 ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov x0, x21
 ; CHECK-APPLE: bl {{.*}}free
 
 ; CHECK-O0-LABEL: caller:
@@ -263,11 +263,11 @@
 ; CHECK-APPLE: mov [[ID:x[0-9]+]], x0
 ; CHECK-APPLE: mov x21, xzr
 ; CHECK-APPLE: bl {{.*}}foo_sret
-; CHECK-APPLE: cbnz x21
+; CHECK-APPLE: mov x0, x21
+; CHECK-APPLE: cbnz x0
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8]
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x0, #8]
 ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov x0, x21
 ; CHECK-APPLE: bl {{.*}}free
 
 ; CHECK-O0-LABEL: caller3:
@@ -358,11 +358,11 @@
 ; CHECK-APPLE: mov x21, xzr
 ; CHECK-APPLE: bl {{.*}}foo_vararg
-; CHECK-APPLE: cbnz x21
+; CHECK-APPLE: mov x0, x21
+; CHECK-APPLE: cbnz x0
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8]
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x0, #8]
 ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov x0, x21
 ; CHECK-APPLE: bl {{.*}}free
 
 entry:
 %error_ptr_ref = alloca swifterror %swift_error*
Index: test/CodeGen/AArch64/win64_vararg.ll
===================================================================
--- test/CodeGen/AArch64/win64_vararg.ll
+++ test/CodeGen/AArch64/win64_vararg.ll
@@ -161,25 +161,25 @@
 ; CHECK: add x8, x8, #15
 ; CHECK: mov x9, sp
 ; CHECK: and x8, x8, #0x1fffffff0
-; CHECK: sub x20, x9, x8
+; CHECK: sub [[REG:x[0-9]+]], x9, x8
 ; CHECK: mov x19, x1
-; CHECK: mov x23, sp
+; CHECK: mov [[REG2:x[0-9]+]], sp
 ; CHECK: stp x6, x7, [x29, #48]
 ; CHECK: stp x4, x5, [x29, #32]
 ; CHECK: stp x2, x3, [x29, #16]
-; CHECK: mov sp, x20
-; CHECK: ldur x21, [x29, #-40]
-; CHECK: sxtw x22, w0
+; CHECK: mov sp, [[REG]]
+; CHECK: ldur [[REG3:x[0-9]+]], [x29, #-40]
+; CHECK: sxtw [[REG4:x[0-9]+]], w0
 ; CHECK: bl __local_stdio_printf_options
 ; CHECK: ldr x8, [x0]
-; CHECK: mov x1, x20
-; CHECK: mov x2, x22
+; CHECK: mov x1, [[REG]]
+; CHECK: mov x2, [[REG4]]
 ; CHECK: mov x3, x19
 ; CHECK: orr x0, x8, #0x2
 ; CHECK: mov x4, xzr
-; CHECK: mov x5, x21
+; CHECK: mov x5, [[REG3]]
 ; CHECK: bl __stdio_common_vsprintf
-; CHECK: mov sp, x23
+; CHECK: mov sp, [[REG2]]
 ; CHECK: sub sp, x29, #48
 ; CHECK: ldp x29, x30, [sp, #48]
 ; CHECK: ldp x20, x19, [sp, #32]
@@ -255,17 +255,15 @@
 ; CHECK-LABEL: fixed_params
 ; CHECK: sub sp, sp, #32
-; CHECK: mov w8, w3
-; CHECK: mov w9, w2
-; CHECK: mov w10, w1
+; CHECK-DAG: mov [[REG0:w[0-9]+]], w3
+; CHECK-DAG: mov [[REG1:w[0-9]+]], w2
+; CHECK: mov [[REG2:w[0-9]+]], w1
 ; CHECK: str w4, [sp]
 ; CHECK: fmov x1, d0
 ; CHECK: fmov x3, d1
 ; CHECK: fmov x5, d2
 ; CHECK: fmov x7, d3
-; CHECK: mov w2, w10
-; CHECK: mov w4, w9
-; CHECK: mov w6, w8
+; CHECK: mov w4, [[REG1]]
 ; CHECK: str x30, [sp, #16]
 ; CHECK: str d4, [sp, #8]
 ; CHECK: bl varargs
Index: test/CodeGen/AMDGPU/addrspacecast.ll
===================================================================
--- test/CodeGen/AMDGPU/addrspacecast.ll
+++ test/CodeGen/AMDGPU/addrspacecast.ll
@@ -10,10 +10,10 @@
 ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
 ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
 ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
-; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
-; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_cmp_ne_u32_e64 s[0:1], [[PTR]], -1
+; CI-DAG: v_cndmask_b32_e64 v[[HI:[0-9]+]], 0, [[VAPERTURE]], s[0:1]
 ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; CI-DAG: v_cndmask_b32_e64 v[[LO:[0-9]+]], 0, [[VPTR]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 
 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
@@ -22,17 +22,17 @@
 ; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
-; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
-; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; GFX9: v_cmp_ne_u32_e64 s[0:1], [[PTR]], -1
+; GFX9: v_cndmask_b32_e64 v[[HI:[0-9]+]], 0, [[VAPERTURE]], s[0:1]
 ; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; GFX9-DAG: v_cndmask_b32_e64 v[[LO:[0-9]+]], 0, [[VPTR]]
 
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
 
 ; At most 2 digits. Make sure src_shared_base is not counted as a high
 ; number SGPR.
-; CI: NumSgprs: {{[0-9][0-9]+}}
+; CI: NumSgprs: {{[0-9][0-9]?}}
 ; GFX9: NumSgprs: {{[0-9]+}}
 define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
@@ -51,10 +51,10 @@
 ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
 ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
-; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_cmp_ne_u32_e64 s[0:1], [[PTR]], 0
+; CI-DAG: v_cndmask_b32_e64 v[[HI:[0-9]+]], 0, [[VAPERTURE]], s[0:1]
 ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; CI-DAG: v_cndmask_b32_e64 v[[LO:[0-9]+]], 0, [[VPTR]]
 
 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
 ; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16)
@@ -64,14 +64,14 @@
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
-; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; GFX9: v_cmp_ne_u32_e64 s[0:1], [[PTR]], 0
+; GFX9: v_cndmask_b32_e64 v[[HI:[0-9]+]], 0, [[VAPERTURE]], s[0:1]
 ; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; GFX9-DAG: v_cndmask_b32_e64 v[[LO:[0-9]+]], 0, [[VPTR]]
 
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
 
-; CI: NumSgprs: {{[0-9][0-9]+}}
+; CI: NumSgprs: {{[0-9][0-9]?}}
 ; GFX9: NumSgprs: {{[0-9]+}}
 define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
   %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
@@ -112,9 +112,9 @@
 ; HSA: enable_sgpr_queue_ptr = 0
 
 ; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
-; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
+; HSA-DAG: v_cmp_ne_u64_e64 s[0:1], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
-; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_cndmask_b32_e64 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
 define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
@@ -129,9 +129,9 @@
 ; HSA: enable_sgpr_queue_ptr = 0
 
 ; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
-; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
+; HSA-DAG: v_cmp_ne_u64_e64 s[0:1], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
-; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
+; HSA-DAG: v_cndmask_b32_e64 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
@@ -269,7 +269,7 @@
 ; HSA-LABEL: {{^}}store_flat_scratch:
 ; CI-DAG: s_mov_b32 flat_scratch_lo, s9
 ; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
-; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
+; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
 
 ; GFX9: s_add_u32 flat_scratch_lo, s6, s9
 ; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
Index: test/CodeGen/AMDGPU/anyext.ll
===================================================================
--- test/CodeGen/AMDGPU/anyext.ll
+++ test/CodeGen/AMDGPU/anyext.ll
@@ -44,8 +44,8 @@
 ; GFX9: global_load_short_d16_hi
 ; GFX9: v_and_b32_e32 v{{[0-9]+}}, 0x80008000
 ; GFX9: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GFX9: v_cmp_eq_f32_e32
-; GFX9: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
+; GFX9: v_cmp_eq_f32_e64
+; GFX9: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s[0:1]
 define amdgpu_kernel void @anyext_v2i16_to_v2i32() #0 {
 bb:
   %tmp = load i16, i16 addrspace(1)* undef, align 2
Index: test/CodeGen/AMDGPU/branch-condition-and.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-condition-and.ll
+++ test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -10,9 +10,9 @@
 ; that was not treated correctly.
 ;
 ; GCN-LABEL: {{^}}ham:
-; GCN-DAG: v_cmp_lt_f32_e64 [[OTHERCC:s\[[0-9]+:[0-9]+\]]],
-; GCN-DAG: v_cmp_lt_f32_e32 vcc,
-; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
+; GCN-DAG: v_cmp_lt_f32_e64 [[OTHERCC:s\[[0-9]+:[0-9]+\]]], 0, v0
+; GCN-DAG: v_cmp_lt_f32_e64 [[CC:s\[[0-9]+:[0-9]+\]]], 0, v1
+; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[OTHERCC]], [[CC]]
 ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
 ; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
Index: test/CodeGen/AMDGPU/branch-relaxation.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-relaxation.ll
+++ test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -139,8 +139,8 @@
 ; GCN-LABEL: {{^}}min_long_forward_vbranch:
 
 ; GCN: buffer_load_dword
-; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN: v_cmp_ne_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
+; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[REG]]
 
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
@@ -382,8 +382,8 @@
 ; Requires expanding of required skip branch.
 ; GCN-LABEL: {{^}}uniform_inside_divergent:
-; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
-; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN: v_cmp_gt_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 16, v{{[0-9]+}}
+; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], [[REG]]
 ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]
@@ -430,8 +430,8 @@
 ; si_mask_branch
 
 ; GCN-LABEL: {{^}}analyze_mask_branch:
-; GCN: v_cmp_lt_f32_e32 vcc
-; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN: v_cmp_lt_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]]
+; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], [[REG]]
 ; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop_body
@@ -485,13 +485,12 @@
 ; GCN-LABEL: {{^}}long_branch_hang:
 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
 ; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
-; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
+; GCN: s_add
 
 ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
 ; GCN: s_setpc_b64
 
-; GCN-NEXT: [[LONG_BR_0]]:
 ; GCN-DAG: v_cmp_lt_i32
 ; GCN-DAG: v_cmp_gt_i32
 ; GCN: s_cbranch_vccnz
Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
===================================================================
--- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -208,8 +208,8 @@
 ; GCN: enable_sgpr_workgroup_id_z = 0
 
 ; GCN: s_mov_b32 s33, s8
-; GCN: s_mov_b32 s4, s33
-; GCN: s_mov_b32 s6, s7
+; GCN-DAG: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s6, s7
 ; GCN: s_mov_b32 s32, s33
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
@@ -223,8 +223,8 @@
 ; GCN: enable_sgpr_workgroup_id_z = 1
 
 ; GCN: s_mov_b32 s33, s8
-; GCN: s_mov_b32 s4, s33
-; GCN: s_mov_b32 s6, s7
+; GCN-DAG: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s6, s7
 
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
   call void @use_workgroup_id_z()
@@ -396,7 +396,7 @@
 ; GCN-DAG: s_mov_b32 s33, s8
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s6, s7
 ; GCN-DAG: s_mov_b32 s32, s33
 ; GCN: s_swappc_b64
@@ -412,7 +412,7 @@
 ; GCN: s_mov_b32 s33, s8
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s6, s7
 
 ; GCN: s_mov_b32 s32, s33
Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
===================================================================
--- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -220,8 +220,8 @@
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
 ; GCN: enable_vgpr_workitem_id = 2
-; GCN: v_mov_b32_e32 v0, 0x22b
-; GCN: v_mov_b32_e32 v1, v2
+; GCN-DAG: v_mov_b32_e32 v0, 0x22b
+; GCN-DAG: v_mov_b32_e32 v1, v2
 ; GCN: s_swappc_b64
 ; GCN-NOT: v0
 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll
===================================================================
--- test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -95,7 +95,7 @@
 ; GCN-LABEL: {{^}}loop_arg_0:
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; GCN: v_cmp_eq_u32_e32 vcc, 1,
+; GCN: v_cmp_eq_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 1,
 
 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]
 ; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80
Index: test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
===================================================================
--- test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -5,8 +5,8 @@
 ; Produces error after adding an implicit def to v_cndmask_b32
 
 ; GCN-LABEL: {{^}}vcc_shrink_vcc_def:
-; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
+; GCN: v_cmp_eq_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[REG]]
 ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
 bb0:
@@ -33,7 +33,7 @@
 ; GCN-LABEL: {{^}}preserve_condition_undef_flag:
 ; GCN-NOT: vcc
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[REG:s\[[0-9]+:[0-9]+\]]]
 ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
 bb0:
Index: test/CodeGen/AMDGPU/collapse-endcf.ll
===================================================================
--- test/CodeGen/AMDGPU/collapse-endcf.ll
+++ test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -4,7 +4,7 @@
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
 ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF]]
-; GCN: s_and_b64 exec, exec, vcc
+; GCN: s_and_b64 exec, exec, [[REG:s\[[0-9]+:[0-9]+\]]]
 ; GCN-NEXT: ; mask branch [[ENDIF]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
@@ -124,6 +124,7 @@
 ; GCN-NEXT: s_cbranch_execz [[THEN_OUTER]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
+; GCN: v_cmp_eq_u32_e64
 ; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_ELSE:s\[[0-9:]+\]]]
 ; GCN-NEXT: ; mask branch [[THEN_OUTER_FLOW:BB[0-9_]+]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
@@ -210,7 +211,7 @@
 ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]
 
 ; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: s_and_b64 exec, exec, vcc
+; GCN: s_and_b64 exec, exec, [[REG:s\[[0-9]+:[0-9]+\]]]
 
 ; GCN-NOT: s_or_b64 exec, exec
Index: test/CodeGen/AMDGPU/commute-compares.ll
===================================================================
--- test/CodeGen/AMDGPU/commute-compares.ll
+++ test/CodeGen/AMDGPU/commute-compares.ll
@@ -7,7 +7,7 @@
 ; --------------------------------------------------------------------------------
 
 ; GCN-LABEL: {{^}}commute_eq_64_i32:
-; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}}
+; GCN: v_cmp_eq_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 64, v{{[0-9]+}}
 define amdgpu_kernel void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -20,7 +20,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ne_64_i32:
-; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}}
+; GCN: v_cmp_ne_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 64, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -35,7 +35,7 @@
 
 ; FIXME: Why isn't this being folded as a constant?
 ; GCN-LABEL: {{^}}commute_ne_litk_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
-; GCN: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, [[K]]
+; GCN: v_cmp_ne_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, [[K]]
 define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -48,7 +48,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ugt_64_i32:
-; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
+; GCN: v_cmp_lt_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 64, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -61,7 +61,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_uge_64_i32:
-; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
+; GCN: v_cmp_lt_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 63, v{{[0-9]+}}
 define amdgpu_kernel void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -74,7 +74,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ult_64_i32:
-; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
+; GCN: v_cmp_gt_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 64, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -87,7 +87,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ule_63_i32:
-; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
+; GCN: v_cmp_gt_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 64, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -101,7 +101,7 @@
 
 ; GCN-LABEL: {{^}}commute_ule_64_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
-; GCN: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, [[K]]
+; GCN: v_cmp_lt_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, [[K]]
 define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -114,7 +114,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
-; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
+; GCN: v_cmp_lt_i32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], -1, v{{[0-9]+}}
 define amdgpu_kernel void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -127,7 +127,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_sge_neg2_i32:
-; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
+; GCN: v_cmp_lt_i32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], -3, v{{[0-9]+}}
 define amdgpu_kernel void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -140,7 +140,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_slt_neg16_i32:
-; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
+; GCN: v_cmp_gt_i32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], -16, v{{[0-9]+}}
 define amdgpu_kernel void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -153,7 +153,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_sle_5_i32:
-; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
+; GCN: v_cmp_gt_i32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 6, v{{[0-9]+}}
 define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -170,7 +170,7 @@
 ; --------------------------------------------------------------------------------
 
 ; GCN-LABEL: {{^}}commute_eq_64_i64:
-; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_eq_u64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 64, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -183,7 +183,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ne_64_i64:
-; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_ne_u64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 64, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -196,7 +196,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ugt_64_i64:
-; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_lt_u64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 64, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -209,7 +209,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_uge_64_i64:
-; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_lt_u64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 63, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -222,7 +222,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ult_64_i64:
-; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_gt_u64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 64, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -235,7 +235,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ule_63_i64:
-; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_gt_u64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 64, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -251,7 +251,7 @@
 
 ; GCN-LABEL: {{^}}commute_ule_64_i64:
 ; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
-; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_gt_u64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -264,7 +264,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
-; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_lt_i64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], -1, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -277,7 +277,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_sge_neg2_i64:
-; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_lt_i64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], -3, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -290,7 +290,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_slt_neg16_i64:
-; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_gt_i64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], -16, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -303,7 +303,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_sle_5_i64:
-; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_gt_i64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 6, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -321,7 +321,7 @@
 
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
-; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_eq_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -335,7 +335,7 @@
 
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
-; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_lt_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -348,7 +348,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_oge_2.0_f32:
-; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_le_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -361,7 +361,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_olt_2.0_f32:
-; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_gt_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -374,7 +374,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ole_2.0_f32:
-; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_ge_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -387,7 +387,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_one_2.0_f32:
-; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_lg_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -400,7 +400,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ord_2.0_f32:
-; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
+; GCN: v_cmp_o_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], [[REG:v[0-9]+]], [[REG]]
 define amdgpu_kernel void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -413,7 +413,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
-; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_nlg_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -426,7 +426,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
-; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_nge_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -439,7 +439,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_uge_2.0_f32:
-; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_ngt_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -452,7 +452,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ult_2.0_f32:
-; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_nle_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -465,7 +465,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ule_2.0_f32:
-; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_nlt_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -478,7 +478,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_une_2.0_f32:
-; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
+; GCN: v_cmp_neq_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -491,7 +491,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_uno_2.0_f32:
-; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
+; GCN: v_cmp_u_f32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], [[REG:v[0-9]+]], [[REG]]
 define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -509,7 +509,7 @@
 
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
-; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_eq_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -523,7 +523,7 @@
 
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
-; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_lt_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -536,7 +536,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_oge_2.0_f64:
-; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_le_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -549,7 +549,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_olt_2.0_f64:
-; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_gt_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -562,7 +562,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ole_2.0_f64:
-; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_ge_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -575,7 +575,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_one_2.0_f64:
-; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_lg_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -588,7 +588,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ord_2.0_f64:
-; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
+; GCN: v_cmp_o_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
 define amdgpu_kernel void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -601,7 +601,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
-; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_nlg_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -614,7 +614,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
-; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_nge_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -627,7 +627,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_uge_2.0_f64:
-; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_ngt_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -640,7 +640,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ult_2.0_f64:
-; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_nle_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -653,7 +653,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_ule_2.0_f64:
-; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_nlt_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -666,7 +666,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_une_2.0_f64:
-; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cmp_neq_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -679,7 +679,7 @@
 }
 
 ; GCN-LABEL: {{^}}commute_uno_2.0_f64:
-; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
+; GCN: v_cmp_u_f64_e64 [[REG:s\[[0-9]+:[0-9]+\]]], [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
 define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -700,7 +700,7 @@
 ; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
 
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
-; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]]
+; GCN: v_cmp_eq_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, [[FI]]
 define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %stack0 = alloca i32
Index: test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz.ll
+++ test/CodeGen/AMDGPU/ctlz.ll
@@ -19,9 +19,9 @@
 ; FUNC-LABEL: {{^}}s_ctlz_i32:
 ; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; GCN-DAG: s_flbit_i32_b32 [[CTLZ:s[0-9]+]], [[VAL]]
-; GCN-DAG: v_cmp_ne_u32_e64 vcc, [[VAL]], 0{{$}}
+; GCN-DAG: v_cmp_ne_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VCTLZ:v[0-9]+]], [[CTLZ]]
-; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[VCTLZ]], vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 32, [[VCTLZ]], [[REG:s\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: s_endpgm
@@ -36,8 +36,8 @@
 ; FUNC-LABEL: {{^}}v_ctlz_i32:
 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
-; GCN: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
-; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc
+; GCN: v_cmp_ne_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 32, [[CTLZ]], [[REG]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
@@ -106,10 +106,10 @@
 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; VI-DAG: v_ffbh_u32_sdwa [[FFBH:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
-; VI: v_cmp_ne_u16_e32 vcc, 0, [[VAL]]
+; SI: v_cmp_ne_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
+; VI: v_cmp_ne_u16_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
 
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 32, [[FFBH]], vcc
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 32, [[FFBH]], [[REG:s\[[0-9]+:[0-9]+\]]]
 
 ; SI: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, 24, [[SELECT]]
 ; VI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, -16, [[SELECT]]
@@ -124,13 +124,13 @@
 ; FUNC-LABEL: {{^}}s_ctlz_i64:
 ; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
+; GCN-DAG: v_cmp_eq_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], s[[HI]], 0{{$}}
 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
 ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
 ; GCN-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]]
 ; GCN-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
-; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
+; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
 define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
@@ -149,14 +149,14 @@
 ; FUNC-LABEL: {{^}}v_ctlz_i64:
 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
 ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], vcc
+; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[REG]]
 ; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
-; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
-; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
+; GCN-DAG: v_cmp_ne_u32_e64 [[REG2:s\[[0-9]+:[0-9]+\]]], 0, [[OR]]
+; GCN-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], [[REG2]]
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}}
 define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
Index: test/CodeGen/AMDGPU/ctlz_zero_undef.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -99,14 +99,14 @@
 ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
 ; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
-; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
-; GCN-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
-; GCN-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
-; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
+; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
+; GCN-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
+; GCN: v_cmp_eq_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], s[[HI]], 0{{$}}
+; GCN: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]], [[REG]]
 ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
 define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
@@ -124,11 +124,11 @@
 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
 ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
+; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]], [[REG]]
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}}
 define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
@@ -200,8 +200,8 @@
 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
 ; GCN-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]]
-; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[VAL]]
-; GCN-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc
+; GCN-DAG: v_cmp_eq_u32_e64 [[REG:s\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
+; GCN-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, [[REG]]
 ; GCN-DAG: buffer_store_dword [[RESULT0]]
 ; GCN-DAG: buffer_store_byte [[RESULT1]]
 ; GCN: s_endpgm
Index: test/CodeGen/AMDGPU/fcmp.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fcmp.f16.ll
+++ test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -6,8 +6,8 @@
 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
+; SI: v_cmp_lt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]]
+; VI: v_cmp_lt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]]
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
@@ -31,7 +31,7 @@
 ; SI: v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]|
 ; SI: v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]|
-; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
+; SI: v_cmp_lt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]]
 ; VI: v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]|
 
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
@@ -57,8 +57,8 @@
 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_eq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
+; SI: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]]
+; VI: v_cmp_eq_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]]
v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -80,8 +80,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_le_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_le_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_le_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_le_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -103,8 +103,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_gt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_gt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_gt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -126,8 +126,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_lg_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_lg_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_lg_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -149,8 +149,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_ge_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_ge_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_ge_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -172,8 +172,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_o_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_o_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_o_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_o_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -195,8 +195,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_u_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_u_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_u_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_u_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -218,8 +218,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_nge_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_nge_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: 
v_cmp_nge_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -241,8 +241,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_nlg_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_nlg_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_nlg_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -264,8 +264,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_ngt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_ngt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_ngt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -287,8 +287,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_nle_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_nle_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -310,8 +310,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_neq_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_neq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_neq_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -333,8 +333,8 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; SI: v_cmp_nlt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_nlt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16]], v[[B_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm @@ -352,11 +352,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_lt: -; SI: v_cmp_lt_f32_e32 vcc, -; SI: v_cmp_lt_f32_e32 vcc, +; SI: v_cmp_lt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, +; SI: v_cmp_lt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, -; VI: v_cmp_lt_f16_e32 vcc, -; VI: v_cmp_lt_f16_e32 vcc, +; VI: v_cmp_lt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, +; VI: v_cmp_lt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, define amdgpu_kernel void @fcmp_v2f16_lt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -371,11 +371,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_eq -; SI: v_cmp_eq_f32_e32 vcc, -; SI: v_cmp_eq_f32_e32 vcc, +; SI: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, +; SI: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, -; VI: v_cmp_eq_f16_e32 vcc, -; VI: v_cmp_eq_f16_e32 vcc, +; VI: v_cmp_eq_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, +; VI: v_cmp_eq_f16_e64 
{{s\[[0-9]+:[0-9]+\]}}, define amdgpu_kernel void @fcmp_v2f16_eq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -390,10 +390,10 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_le: -; SI: v_cmp_le_f32_e32 vcc -; SI: v_cmp_le_f32_e32 vcc -; VI: v_cmp_le_f16_e32 vcc -; VI: v_cmp_le_f16_e32 vcc +; SI: v_cmp_le_f32_e64 {{s\[[0-9]+:[0-9]+\]}} +; SI: v_cmp_le_f32_e64 {{s\[[0-9]+:[0-9]+\]}} +; VI: v_cmp_le_f16_e64 {{s\[[0-9]+:[0-9]+\]}} +; VI: v_cmp_le_f16_e64 {{s\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fcmp_v2f16_le( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -408,11 +408,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_gt: -; SI: v_cmp_gt_f32_e32 vcc, -; SI: v_cmp_gt_f32_e32 vcc, +; SI: v_cmp_gt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, +; SI: v_cmp_gt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, -; VI: v_cmp_gt_f16_e32 vcc, -; VI: v_cmp_gt_f16_e32 vcc, +; VI: v_cmp_gt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, +; VI: v_cmp_gt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, define amdgpu_kernel void @fcmp_v2f16_gt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -427,11 +427,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_lg: -; SI: v_cmp_lg_f32_e32 vcc, -; SI: v_cmp_lg_f32_e32 vcc, +; SI: v_cmp_lg_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, +; SI: v_cmp_lg_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, -; VI: v_cmp_lg_f16_e32 vcc, -; VI: v_cmp_lg_f16_e32 vcc, +; VI: v_cmp_lg_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, +; VI: v_cmp_lg_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, define amdgpu_kernel void @fcmp_v2f16_lg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -446,11 +446,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_ge: -; SI: v_cmp_ge_f32_e32 vcc, -; SI: v_cmp_ge_f32_e32 vcc, +; SI: v_cmp_ge_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, +; SI: v_cmp_ge_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, -; VI: v_cmp_ge_f16_e32 vcc, -; VI: v_cmp_ge_f16_e32 vcc, +; VI: v_cmp_ge_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, +; VI: v_cmp_ge_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, define amdgpu_kernel void @fcmp_v2f16_ge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -465,11 +465,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_o: -; SI: v_cmp_o_f32_e32 vcc, -; SI: v_cmp_o_f32_e32 vcc, +; SI: v_cmp_o_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, +; SI: v_cmp_o_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, -; VI: v_cmp_o_f16_e32 vcc, -; VI: v_cmp_o_f16_e32 vcc, +; VI: v_cmp_o_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, +; VI: v_cmp_o_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, define amdgpu_kernel void @fcmp_v2f16_o( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -484,11 +484,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_u: -; SI: v_cmp_u_f32_e32 vcc, -; SI: v_cmp_u_f32_e32 vcc, +; SI: v_cmp_u_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, +; SI: v_cmp_u_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, -; VI: v_cmp_u_f16_e32 vcc, -; VI: v_cmp_u_f16_e32 vcc, +; VI: v_cmp_u_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, +; VI: v_cmp_u_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, define amdgpu_kernel void @fcmp_v2f16_u( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -503,11 +503,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_nge -; SI: v_cmp_nge_f32_e32 vcc, -; SI: v_cmp_nge_f32_e32 vcc, +; SI: v_cmp_nge_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, +; SI: v_cmp_nge_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, -; VI: v_cmp_nge_f16_e32 vcc, -; VI: v_cmp_nge_f16_e32 vcc, +; VI: v_cmp_nge_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, +; VI: v_cmp_nge_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, define amdgpu_kernel void @fcmp_v2f16_nge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -522,11 +522,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_nlg -; SI: v_cmp_nlg_f32_e32 vcc -; SI: v_cmp_nlg_f32_e32 vcc +; SI: v_cmp_nlg_f32_e64 {{s\[[0-9]+:[0-9]+\]}} +; SI: v_cmp_nlg_f32_e64 
{{s\[[0-9]+:[0-9]+\]}} -; VI: v_cmp_nlg_f16_e32 vcc -; VI: v_cmp_nlg_f16_e32 vcc +; VI: v_cmp_nlg_f16_e64 {{s\[[0-9]+:[0-9]+\]}} +; VI: v_cmp_nlg_f16_e64 {{s\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fcmp_v2f16_nlg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -541,11 +541,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_ngt -; SI: v_cmp_ngt_f32_e32 vcc, -; SI: v_cmp_ngt_f32_e32 vcc, +; SI: v_cmp_ngt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, +; SI: v_cmp_ngt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, -; VI: v_cmp_ngt_f16_e32 vcc, -; VI: v_cmp_ngt_f16_e32 vcc, +; VI: v_cmp_ngt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, +; VI: v_cmp_ngt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, define amdgpu_kernel void @fcmp_v2f16_ngt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -560,11 +560,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_nle -; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cmp_nle_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cmp_nle_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_cmp_nle_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_cmp_nle_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fcmp_v2f16_nle( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -579,11 +579,11 @@ } ; GCN-LABEL: {{^}}fcmp_v2f16_neq -; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cmp_neq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cmp_neq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_cmp_neq_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_cmp_neq_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fcmp_v2f16_neq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -603,16 +603,16 @@ ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_cmp_nlt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32_0]], v[[B_F32_0]] ; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]] -; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; SI-DAG: v_cmp_nlt_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32_1]], v[[B_F32_1]] +; VI-DAG: v_cmp_nlt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_V2_F16]], v[[B_V2_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]] +; VI: v_cmp_nlt_f16_e64 {{s\[[0-9]+:[0-9]+\]}}, v[[A_F16_1]], v[[B_F16_1]] ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm Index: test/CodeGen/SystemZ/call-03.ll =================================================================== --- test/CodeGen/SystemZ/call-03.ll +++ test/CodeGen/SystemZ/call-03.ll @@ -62,16 +62,13 @@ ; Check an indirect call. In this case the only acceptable choice for ; the target register is %r1. 
-;
-; NOTE: the extra copy 'lgr %r1, %r0' is a coalescing failure.
 define void @f5(void(i32, i32, i32, i32) *%foo) {
 ; CHECK-LABEL: f5:
-; CHECK: lgr %r0, %r2
+; CHECK: lgr %r1, %r2
 ; CHECK-DAG: lhi %r2, 1
 ; CHECK-DAG: lhi %r3, 2
 ; CHECK-DAG: lhi %r4, 3
 ; CHECK-DAG: lhi %r5, 4
-; CHECK: lgr %r1, %r0
 ; CHECK: br %r1
   tail call void %foo(i32 1, i32 2, i32 3, i32 4)
   ret void
Index: test/CodeGen/SystemZ/swift-return.ll
===================================================================
--- test/CodeGen/SystemZ/swift-return.ll
+++ test/CodeGen/SystemZ/swift-return.ll
@@ -39,9 +39,8 @@
 ; in memory. The caller provides space for the return value and passes
 ; the address in %r2. The first input argument will be in %r3.
 ; CHECK-LABEL: test2:
-; CHECK: lr %[[REG1:r[0-9]+]], %r2
+; CHECK: lr %r3, %r2
 ; CHECK-DAG: la %r2, 160(%r15)
-; CHECK-DAG: lr %r3, %[[REG1]]
 ; CHECK: brasl %r14, gen2
 ; CHECK: l %r2, 160(%r15)
 ; CHECK: a %r2, 164(%r15)
Index: test/CodeGen/SystemZ/swifterror.ll
===================================================================
--- test/CodeGen/SystemZ/swifterror.ll
+++ test/CodeGen/SystemZ/swifterror.ll
@@ -34,11 +34,11 @@
 ; CHECK: lgr %r[[REG1:[0-9]+]], %r2
 ; CHECK: lghi %r9, 0
 ; CHECK: brasl %r14, foo
-; CHECK: cgijlh %r9, 0,
+; CHECK: ltgr %r2, %r9
+; CHECK: jlh
 ; Access part of the error object and save it to error_ref
-; CHECK: lb %r[[REG2:[0-9]+]], 8(%r9)
+; CHECK: lb %r[[REG2:[0-9]+]], 8(%r2)
 ; CHECK: stc %r[[REG2]], 0(%r[[REG1]])
-; CHECK: lgr %r2, %r9
 ; CHECK: brasl %r14, free
 ; CHECK-O0-LABEL: caller:
 ; CHECK-O0: lghi %r9, 0
@@ -246,11 +246,10 @@
 ; CHECK: lhi %r3, 1
 ; CHECK: lghi %r9, 0
 ; CHECK: brasl %r14, foo_sret
-; CHECK: cgijlh %r9, 0,
+; CHECK: jlh
 ; Access part of the error object and save it to error_ref
-; CHECK: lb %r0, 8(%r9)
+; CHECK: lb %r0, 8(%r2)
 ; CHECK: stc %r0, 0(%r[[REG1]])
-; CHECK: lgr %r2, %r9
 ; CHECK: brasl %r14, free
 ; CHECK-O0-LABEL: caller3:
@@ -296,21 +295,21 @@
 ; The first swifterror value:
 ; CHECK: lghi %r9, 0
 ; CHECK: brasl %r14, foo
-; CHECK: cgijlh %r9, 0,
+; CHECK: ltgr %r2, %r9
+; CHECK: jlh
 ; Access part of the error object and save it to error_ref
-; CHECK: lb %r0, 8(%r9)
+; CHECK: lb %r0, 8(%r2)
 ; CHECK: stc %r0, 0(%r[[REG1]])
-; CHECK: lgr %r2, %r9
 ; CHECK: brasl %r14, free
 ; The second swifterror value:
 ; CHECK: lghi %r9, 0
 ; CHECK: brasl %r14, foo
-; CHECK: cgijlh %r9, 0,
+; CHECK: ltgr %r2, %r9
+; CHECK: jlh
 ; Access part of the error object and save it to error_ref
-; CHECK: lb %r0, 8(%r9)
+; CHECK: lb %r0, 8(%r2)
 ; CHECK: stc %r0, 0(%r[[REG2]])
-; CHECK: lgr %r2, %r9
 ; CHECK: brasl %r14, free
 ; CHECK-O0-LABEL: caller_with_multiple_swifterror_values:
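
A note on the FileCheck idiom that recurs throughout the AMDGPU updates above: on GCN, the VOP2 (_e32) forms of v_cmp implicitly write their result to VCC, while the VOP3 (_e64) forms encode an explicit SGPR-pair destination. Once the allocator is free to choose that pair, the tests can no longer hard-code vcc; instead, [[REG:s\[[0-9]+:[0-9]+\]]] binds whichever SGPR pair the compare defines, and a later bare [[REG]] requires the consuming v_cndmask to read exactly that pair. A minimal sketch of the capture-and-reuse pattern (hypothetical check lines, not taken from any test above):

; CHECK: v_cmp_eq_u32_e64 [[CC:s\[[0-9]+:[0-9]+\]]], 0, v0
; CHECK: v_cndmask_b32_e64 v1, 32, v2, [[CC]]

The anonymous form {{s\[[0-9]+:[0-9]+\]}} used in fcmp.f16.ll matches the same shape without binding a name, which suffices when no later line needs to refer back to the chosen register.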