diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4625,10 +4625,9 @@
           .addMBB(UncondBrTarget);
       } else {
         B.buildInstr(AMDGPU::SI_ELSE)
-          .addDef(Def)
-          .addUse(Use)
-          .addMBB(UncondBrTarget)
-          .addImm(0);
+            .addDef(Def)
+            .addUse(Use)
+            .addMBB(UncondBrTarget);
       }
 
       if (Br) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -325,7 +325,7 @@
 
 def SI_ELSE : CFPseudoInstSI <
   (outs SReg_1:$dst),
-  (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
   let Size = 12;
   let hasSideEffects = 1;
 }
@@ -745,7 +745,7 @@
 
 def : GCNPat<
   (AMDGPUelse i1:$src, bb:$target),
-  (SI_ELSE $src, $target, 0)
+  (SI_ELSE $src, $target)
 >;
 
 def : Pat <
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -333,13 +333,11 @@
   Register DstReg = MI.getOperand(0).getReg();
-  bool ExecModified = MI.getOperand(3).getImm() != 0;
 
   MachineBasicBlock::iterator Start = MBB.begin();
 
   // This must be inserted before phis and any spill code inserted before the
   // else.
-  Register SaveReg = ExecModified ?
-    MRI->createVirtualRegister(BoolRC) : DstReg;
+  Register SaveReg = MRI->createVirtualRegister(BoolRC);
   MachineInstr *OrSaveExec =
     BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
     .add(MI.getOperand(1)); // Saved EXEC
@@ -348,15 +346,14 @@
 
   MachineBasicBlock::iterator ElsePt(MI);
 
-  if (ExecModified) {
-    MachineInstr *And =
-      BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
-      .addReg(Exec)
-      .addReg(SaveReg);
+  // This accounts for any modification of the EXEC mask within the block and
+  // can be optimized out pre-RA when not required.
+  MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
+                          .addReg(Exec)
+                          .addReg(SaveReg);
 
-    if (LIS)
-      LIS->InsertMachineInstrInMaps(*And);
-  }
+  if (LIS)
+    LIS->InsertMachineInstrInMaps(*And);
 
   MachineInstr *Xor =
     BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
@@ -386,8 +383,7 @@
 
   LIS->removeInterval(DstReg);
   LIS->createAndComputeVirtRegInterval(DstReg);
-  if (ExecModified)
-    LIS->createAndComputeVirtRegInterval(SaveReg);
+  LIS->createAndComputeVirtRegInterval(SaveReg);
 
   // Let this be recomputed.
   LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -35,10 +35,13 @@
 
   unsigned AndOpc;
   unsigned Andn2Opc;
+  unsigned OrSaveExecOpc;
+  unsigned XorTermrOpc;
   Register CondReg;
   Register ExecReg;
 
   Register optimizeVcndVcmpPair(MachineBasicBlock &MBB);
+  bool optimizeElseBranch(MachineBasicBlock &MBB);
 
 public:
   static char ID;
@@ -224,6 +227,81 @@
   return CCReg;
 }
 
+// Optimize sequence
+//    %dst = S_OR_SAVEEXEC %src
+//    ... instructions not modifying exec ...
+//    %tmp = S_AND $exec, %dst
+//    $exec = S_XOR_term $exec, %tmp
+// =>
+//    %dst = S_OR_SAVEEXEC %src
+//    ... instructions not modifying exec ...
+//    $exec = S_XOR_term $exec, %dst
+//
+// Clean up potentially unnecessary code added for safety during
+// control flow lowering.
+//
+// Return whether any changes were made to MBB.
+bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) {
+  if (MBB.empty())
+    return false;
+
+  // Check this is an else block.
+  auto First = MBB.begin();
+  MachineInstr &SaveExecMI = *First;
+  if (SaveExecMI.getOpcode() != OrSaveExecOpc)
+    return false;
+
+  auto I = llvm::find_if(MBB.terminators(), [this](const MachineInstr &MI) {
+    return MI.getOpcode() == XorTermrOpc;
+  });
+  if (I == MBB.terminators().end())
+    return false;
+
+  MachineInstr &XorTermMI = *I;
+  if (XorTermMI.getOperand(1).getReg() != ExecReg)
+    return false;
+
+  Register SavedExecReg = SaveExecMI.getOperand(0).getReg();
+  Register DstReg = XorTermMI.getOperand(2).getReg();
+
+  // Find potentially unnecessary S_AND
+  MachineInstr *AndExecMI = nullptr;
+  I--;
+  while (I != First && !AndExecMI) {
+    if (I->getOpcode() == AndOpc && I->getOperand(0).getReg() == DstReg &&
+        I->getOperand(1).getReg() == ExecReg)
+      AndExecMI = &*I;
+    I--;
+  }
+  if (!AndExecMI)
+    return false;
+
+  // Check for exec modifying instructions.
+  // Note: exec defs do not create live ranges beyond the
+  // instruction so isDefBetween cannot be used.
+  // Instead just check that the def segments are adjacent.
+  SlotIndex StartIdx = LIS->getInstructionIndex(SaveExecMI);
+  SlotIndex EndIdx = LIS->getInstructionIndex(*AndExecMI);
+  for (MCRegUnitIterator UI(ExecReg, TRI); UI.isValid(); ++UI) {
+    LiveRange &RegUnit = LIS->getRegUnit(*UI);
+    if (RegUnit.find(StartIdx) != std::prev(RegUnit.find(EndIdx)))
+      return false;
+  }
+
+  // Remove unnecessary S_AND
+  LIS->removeInterval(SavedExecReg);
+  LIS->removeInterval(DstReg);
+
+  SaveExecMI.getOperand(0).setReg(DstReg);
+
+  LIS->RemoveMachineInstrFromMaps(*AndExecMI);
+  AndExecMI->eraseFromParent();
+
+  LIS->createAndComputeVirtRegInterval(DstReg);
+
+  return true;
+}
+
 bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -237,6 +315,9 @@
   const bool Wave32 = ST.isWave32();
   AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
   Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+  OrSaveExecOpc =
+      Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+  XorTermrOpc = Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
   CondReg = Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
   ExecReg = Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
@@ -245,6 +326,11 @@
 
   for (MachineBasicBlock &MBB : MF) {
 
+    if (optimizeElseBranch(MBB)) {
+      RecalcRegs.insert(AMDGPU::SCC);
+      Changed = true;
+    }
+
     if (Register Reg = optimizeVcndVcmpPair(MBB)) {
       RecalcRegs.insert(Reg);
       RecalcRegs.insert(AMDGPU::VCC_LO);
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -730,9 +730,6 @@
       if (MI.isTerminator() && OutNeeds == StateExact)
         Needs = StateExact;
 
-      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
-        MI.getOperand(3).setImm(1);
-
       ++Next;
     } else {
       // End of basic block
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
@@ -140,7 +140,7 @@
     ; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
    ; WAVE64: [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-    ; WAVE64: [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; WAVE64: [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; WAVE64: G_BR %bb.1
     ; WAVE64: bb.1:
     ; WAVE32-LABEL: name: brcond_si_else
@@ -149,7 +149,7 @@
     ; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
    ; WAVE32: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-    ; WAVE32: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; WAVE32: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; WAVE32: G_BR %bb.1
     ; WAVE32: bb.1:
   bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -475,13 +475,14 @@
     ; GCN: bb.2:
     ; GCN: successors: %bb.3(0x40000000), %bb.6(0x40000000)
    ; GCN: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[S_XOR_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; GCN: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+    ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+    ; GCN: $exec = S_XOR_B64_term $exec, [[S_AND_B64_1]], implicit-def $scc
     ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
     ; GCN: bb.3:
     ; GCN: successors: %bb.3(0x40000000), %bb.4(0x40000000)
     ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
-    ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc
-    ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
+    ; GCN: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc
+    ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_2]]
     ; GCN: S_CBRANCH_EXECZ %bb.3, implicit $exec
     ; GCN: bb.4:
     ; GCN: successors: %bb.5(0x80000000)
@@ -489,7 +490,7 @@
     ; GCN: successors: %bb.6(0x80000000)
     ; GCN: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc
     ; GCN: bb.6:
-    ; GCN: $exec = S_OR_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+    ; GCN: $exec = S_OR_B64 $exec, [[S_AND_B64_1]], implicit-def $scc
     ; GCN: S_ENDPGM 0
   bb.0:
     successors: %bb.1, %bb.2
@@ -502,7 +503,7 @@
 
   bb.2:
     successors: %bb.3, %bb.6
-    %2:sreg_64 = SI_ELSE %0:sreg_64, %bb.6, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_ELSE %0:sreg_64, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.3:
     successors: %bb.3, %bb.4
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -196,19 +196,20 @@
 
 ; Regular spill value restored after exec modification
 ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
+; Followed by spill
+; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: s_and_b64 s{{\[}}[[FLOW_AND_EXEC_LO:[0-9]+]]:[[FLOW_AND_EXEC_HI:[0-9]+]]{{\]}}, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}}
 
 ; Spill saved exec
-; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
-; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]
-
+; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_LO]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
+; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]
 
-; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]], 0
-; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]], 1
+; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_AND_EXEC_LO]], 0
+; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_AND_EXEC_HI]], 1
 ; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
-; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}}
+; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}}
 ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
@@ -100,11 +100,12 @@
   ; CHECK: bb.0:
   ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5
-  ; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5
   ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec
-  ; CHECK: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_B64_]], implicit-def $scc
   ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec
   ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; CHECK: S_BRANCH %bb.2
@@ -120,7 +121,7 @@
     %0:vgpr_32 = COPY killed $vgpr0
     %1:sreg_64_xexec = COPY $sgpr4_sgpr5
     %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec
-    %3:sreg_64_xexec = SI_ELSE %2, %bb.1, 0, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64_xexec = SI_ELSE %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec
     S_BRANCH %bb.2
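
Note (not part of the patch): a minimal sketch, in the same MIR notation the new pass comment uses, of the wave64 sequence this change produces and then cleans up. SILowerControlFlow now always emits the S_AND after the S_OR_SAVEEXEC when lowering SI_ELSE, and SIOptimizeExecMaskingPreRA::optimizeElseBranch folds it away again when nothing between the two instructions writes exec. Register names below are illustrative only, not taken from any test above.

  ; Else block as emitted by SILowerControlFlow:
  ;   %saved:sreg_64 = S_OR_SAVEEXEC_B64 %src, implicit-def $exec, implicit-def $scc, implicit $exec
  ;   ...                                      ; no writes to $exec in between
  ;   %masked:sreg_64 = S_AND_B64 $exec, %saved, implicit-def $scc
  ;   $exec = S_XOR_B64_term $exec, %masked, implicit-def $scc
  ;
  ; After SIOptimizeExecMaskingPreRA::optimizeElseBranch (the S_AND is erased
  ; and the save-exec result is renamed to the S_AND's destination):
  ;   %masked:sreg_64 = S_OR_SAVEEXEC_B64 %src, implicit-def $exec, implicit-def $scc, implicit $exec
  ;   ...
  ;   $exec = S_XOR_B64_term $exec, %masked, implicit-def $scc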