diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -893,7 +893,7 @@
     return false;
 
   // V_NOP will be discarded by SQ.
-  // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
+  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
   // which is always a VGPR and available.
   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
   Register Reg = Src0->getReg();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1258,6 +1258,10 @@
   LLVM_READONLY
   int getMFMAEarlyClobberOp(uint16_t Opcode);
 
+  /// \returns v_cmpx version of a v_cmp instruction.
+  LLVM_READONLY
+  int getVCMPXOpFromVCMP(uint16_t Opcode);
+
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2596,6 +2596,15 @@
   let ValueCols = [["0"]];
 }
 
+// Maps a v_cmp instruction to its v_cmpx equivalent.
+def getVCMPXOpFromVCMP : InstrMapping {
+  let FilterClass = "VCMPVCMPXTable";
+  let RowFields = ["VCMPOp"];
+  let ColFields = ["IsVCMPX"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
 include "SIInstructions.td"
 
 include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -292,6 +292,148 @@
   return false;
 }
 
+// Backwards-iterate from Origin (for n iterations) until either the beginning
+// of the BB is reached or Pred evaluates to true - which can be an arbitrary
+// condition based on the current MachineInstr, for instance a target
+// instruction. Breaks prematurely by returning nullptr if DisallowDefBetween
+// is true and one of the registers given in NonModifiableRegs is modified by
+// the current instruction.
+static MachineInstr *findInstrBackwards(
+    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
+    SmallVector<MCRegister, 2> &NonModifiableRegs,
+    bool DisallowDefBetween = true, unsigned MaxInstructions = 5) {
+  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
+                                      E = Origin.getParent()->rend();
+  unsigned CurrentIteration = 0;
+
+  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
+    bool PredResult = Pred(&*A);
+
+    if (!PredResult) {
+      if (DisallowDefBetween)
+        for (MCRegister Reg : NonModifiableRegs)
+          if (A->modifiesRegister(Reg))
+            return nullptr;
+
+      ++CurrentIteration;
+      continue;
+    }
+
+    return &*A;
+  }
+
+  return nullptr;
+}
+
+// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
+// by looking at an instance of an s_and_saveexec instruction. Returns a
+// pointer to the v_cmp instruction if there is no write to any of the v_cmp
+// input operands and no write to exec in between.
+static MachineInstr *findPossibleVCMPVCMPXOptimization(
+    MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
+    const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
+
+  MachineInstr *VCmp = nullptr;
+
+  Register SaveExecDest = SaveExec.getOperand(0).getReg();
+  if (SaveExec.getOperand(0).getSubReg() || !TRI->isSGPRReg(MRI, SaveExecDest))
+    return nullptr;
+
+  MachineOperand *SaveExecSrc0 =
+      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+  if (!SaveExecSrc0->isReg() || SaveExecSrc0->getSubReg())
+    return nullptr;
+
+  SmallVector<MCRegister, 2> NonDefRegs;
+  NonDefRegs.push_back(SaveExecSrc0->getReg());
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand.
+  VCmp = findInstrBackwards(
+      SaveExec,
+      [](MachineInstr *Check) {
+        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1;
+      },
+      NonDefRegs);
+
+  if (!VCmp)
+    return nullptr;
+
+  // Try to determine if there is either a write to Exec or to one of the VCmp
+  // operands between the saveexec and the vcmp.
+  // In the first case, the transformation does not make sense.
+  // In the second case, additional VGPR spilling might need to be inserted,
+  // which might not be worth it.
+  // In either case, don't replace the instruction sequence.
+  NonDefRegs.clear();
+  NonDefRegs.push_back(Exec);
+
+  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+
+  if (!Src0 || !Src1)
+    return nullptr;
+
+  if (Src0->isReg() && !Src0->getSubReg())
+    NonDefRegs.push_back(Src0->getReg());
+
+  if (Src1->isReg() && !Src1->getSubReg())
+    NonDefRegs.push_back(Src1->getReg());
+
+  if (!findInstrBackwards(
+          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+          NonDefRegs))
+    return nullptr;
+
+  return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 s*, exec_lo / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+                                         MachineInstr &VCmp, MCRegister Exec,
+                                         const SIInstrInfo *TII,
+                                         MachineRegisterInfo &MRI) {
+  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+  if (NewOpcode == -1)
+    return false;
+
+  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+  if (SaveExecInstr.getOperand(0).getSubReg())
+    return false;
+
+  MachineBasicBlock::instr_iterator InsertPosIt = VCmp.getIterator();
+  if (!SaveExecInstr.uses().empty())
+    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+            SaveExecInstr.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), MoveDest)
+        .addReg(Exec);
+
+  // Omit dst as V_CMPX is implicitly writing to EXEC.
+  // Add src modifiers, if needed.
+  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+                         VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) !=
+      -1)
+    Builder.addImm(0);
+
+  Builder.add(*Src0);
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1_modifiers) !=
+      -1)
+    Builder.addImm(0);
+
+  Builder.add(*Src1);
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) != -1)
+    Builder.addImm(0);
+
+  return true;
+}
+
 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -299,6 +441,7 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
   // Optimize sequences emitted for control flow lowering. They are originally
@@ -458,6 +601,42 @@
     }
   }
 
-  return true;
+  // After all s_op_saveexec instructions are inserted,
+  // replace (on GFX10.3 and later)
+  // v_cmp_* SGPR, IMM, VGPR
+  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+  // with
+  // s_mov_b32 EXEC_SGPR_DEST, exec_lo
+  // v_cmpx_* IMM, VGPR
+  // to reduce pipeline stalls.
+  if (ST.hasGFX10_3Insts()) {
+    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+    const unsigned AndSaveExecOpcode =
+        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
+        // Try to record existing s_and_saveexec instructions, iff
+        // they are reading from a v_cmp dest SGPR write.
+        if (MI.getOpcode() != AndSaveExecOpcode)
+          continue;
+        if (MachineInstr *VCmp =
+                findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
+          SaveExecVCmpMapping[&MI] = &*VCmp;
+      }
+    }
+
+    for (const auto &Entry : SaveExecVCmpMapping) {
+      MachineInstr *SaveExecInstr = Entry.getFirst();
+      MachineInstr *VCmpInstr = Entry.getSecond();
+
+      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII, *MRI)) {
+        SaveExecInstr->eraseFromParent();
+        VCmpInstr->eraseFromParent();
+      }
+    }
+  }
+
+  return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -734,7 +734,13 @@
       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
 
       if (TII->isVOPC(Op32)) {
-        Register DstReg = MI.getOperand(0).getReg();
+        // Exclude VOPCX instructions as these don't explicitly write a
+        // dst.
+        MachineOperand &Op0 = MI.getOperand(0);
+        if (!Op0.isReg() || !Op0.isDef())
+          continue;
+
+        Register DstReg = Op0.getReg();
         if (DstReg.isVirtual()) {
           // VOPC instructions can only write to the VCC register. We can't
           // force them to use VCC here, because this is only one register and
@@ -744,7 +750,7 @@
           // So, instead of forcing the instruction to write to VCC, we provide
           // a hint to the register allocator to use VCC and then we will run
           // this pass again after RA and shrink it if it outputs to VCC.
-          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
+          MRI.setRegAllocationHint(Op0.getReg(), 0, VCCReg);
           continue;
         }
         if (DstReg != VCCReg)
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -205,6 +205,11 @@
   string NoSDstOp = Name;
 }
 
+class VCMPVCMPXTable <string Name> {
+  bit IsVCMPX = 0;
+  string VCMPOp = Name;
+}
+
 multiclass VOPC_Pseudos <string opName,
                          VOPC_Profile P,
                          SDPatternOperator cond = COND_NULL,
                          string revOp = opName,
                          bit DefExec = 0> {
 
   def _e32 : VOPC_Pseudo <opName, P>,
              Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<1, opName#"_e32"> {
+             VCMPXNoSDstTable<1, opName#"_e32">,
+             VCMPVCMPXTable<opName#"_e32"> {
     let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
     let SchedRW = P.Schedule;
     let isConvergent = DefExec;
@@ -223,7 +229,8 @@
 
   def _e64 : VOP3_Pseudo<opName#"_e64", P, getVOPCPat64<cond, P>.ret>,
              Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<1, opName#"_e64"> {
+             VCMPXNoSDstTable<1, opName#"_e64">,
+             VCMPVCMPXTable<opName#"_e64"> {
     let Defs = !if(DefExec, [EXEC], []);
     let SchedRW = P.Schedule;
     let isCompare = 1;
@@ -248,23 +255,29 @@
   def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, cond, 0>,
              Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<0, opName#"_e32"> {
+             VCMPXNoSDstTable<0, opName#"_e32">,
+             VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName)#"_e32"> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isConvergent = 1;
     let isCompare = 1;
     let isCommutable = 1;
     let SubtargetPredicate = HasNoSdstCMPX;
+    // If the result of the substitution is not equal to the original
+    // opName, this is likely to be a v_cmpx instruction.
+    let IsVCMPX = !ne(!subst("v_cmpx", "v_cmp", opName), opName);
   }
 
   def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst_e64", P_NoSDst>,
              Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<0, opName#"_e64"> {
+             VCMPXNoSDstTable<0, opName#"_e64">,
+             VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName)#"_e64"> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isCompare = 1;
     let isCommutable = 1;
     let SubtargetPredicate = HasNoSdstCMPX;
+    let IsVCMPX = !ne(!subst("v_cmpx", "v_cmp", opName), opName);
   }
 
   foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
@@ -915,7 +928,7 @@
     def _e64_gfx10 :
       VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
       VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
-      let Inst{7-0} = ?; // sdst
+      let Inst{7-0} = 0x7e; // sdst
       let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
                     # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
     }
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s
 
@@ -6,7 +7,7 @@
 ; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
 ; GFX1030: s_cmp_lg_u32
-; GFX1030-NEXT: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
 
 ; GFX1010: s_cmp_lg_u32
 ; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
@@ -51,9 +52,9 @@
 }
 
 ; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
-; GFX1030: v_cmp_eq_u32
-; GFX1030: s_and_saveexec_b32
-; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_mov_b32
+; GFX1030: v_cmpx_eq_u32
+; GFX1030: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
 
 ; GFX1010: v_cmp_eq_u32
 ; GFX1010: s_and_saveexec_b32
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
@@ -0,0 +1,110 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_lt:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 15, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e64 15, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
+entry:
+  %bc = icmp slt i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_gt:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 17, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 17, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_gt(i32 %x) {
+entry:
+  %bc = icmp sgt i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_eq:
+; GFX1010: v_cmp_ne_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_ne_u32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_eq(i32 %x) {
+entry:
+  %bc = icmp eq i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ne:
+; GFX1010: v_cmp_eq_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_eq_u32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ne(i32 %x) {
+entry:
+  %bc = icmp ne i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_le:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_le(i32 %x) {
+entry:
+  %bc = icmp sle i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ge:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ge(i32 %x) {
+entry:
+  %bc = icmp sge i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1248,8 +1248,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_ne_u32_e64 0, v1
 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB23_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
@@ -1327,8 +1327,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_ne_u32_e64 0, v1
 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
@@ -1505,12 +1505,12 @@
 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: v_cmpx_nlt_f32_e64 0, v1
 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
-; GFX10-W32-NEXT: ; implicit-def: $vgpr0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX10-W32-NEXT: ; implicit-def: $vgpr0
 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
 ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
 ; GFX10-W32-NEXT: ; implicit-def: $vgpr1
@@ -1575,8 +1575,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e64 0, v1
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
@@ -2954,9 +2954,9 @@
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e64 0, v1
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
 ; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo