diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4025,77 +4025,6 @@
     MI.eraseFromParent();
     return BB;
   }
-  case AMDGPU::SI_INIT_EXEC:
-    // This should be before all vector instructions.
-    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
-            AMDGPU::EXEC)
-        .addImm(MI.getOperand(0).getImm());
-    MI.eraseFromParent();
-    return BB;
-
-  case AMDGPU::SI_INIT_EXEC_LO:
-    // This should be before all vector instructions.
-    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
-            AMDGPU::EXEC_LO)
-        .addImm(MI.getOperand(0).getImm());
-    MI.eraseFromParent();
-    return BB;
-
-  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
-    // Extract the thread count from an SGPR input and set EXEC accordingly.
-    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
-    //
-    // S_BFE_U32 count, input, {shift, 7}
-    // S_BFM_B64 exec, count, 0
-    // S_CMP_EQ_U32 count, 64
-    // S_CMOV_B64 exec, -1
-    MachineInstr *FirstMI = &*BB->begin();
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    Register InputReg = MI.getOperand(0).getReg();
-    Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    bool Found = false;
-
-    // Move the COPY of the input reg to the beginning, so that we can use it.
-    for (auto I = BB->begin(); I != &MI; I++) {
-      if (I->getOpcode() != TargetOpcode::COPY ||
-          I->getOperand(0).getReg() != InputReg)
-        continue;
-
-      if (I == FirstMI) {
-        FirstMI = &*++BB->begin();
-      } else {
-        I->removeFromParent();
-        BB->insert(FirstMI, &*I);
-      }
-      Found = true;
-      break;
-    }
-    assert(Found);
-    (void)Found;
-
-    // This should be before all vector instructions.
-    unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
-    bool isWave32 = getSubtarget()->isWave32();
-    unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
-        .addReg(InputReg)
-        .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
-    BuildMI(*BB, FirstMI, DebugLoc(),
-            TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
-            Exec)
-        .addReg(CountReg)
-        .addImm(0);
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
-        .addReg(CountReg, RegState::Kill)
-        .addImm(getSubtarget()->getWavefrontSize());
-    BuildMI(*BB, FirstMI, DebugLoc(),
-            TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
-            Exec)
-        .addImm(-1);
-    MI.eraseFromParent();
-    return BB;
-  }
-
   case AMDGPU::GET_GROUPSTATICSIZE: {
     assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
            getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1860,6 +1860,90 @@
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
+  case AMDGPU::SI_INIT_EXEC: {
+    // Lower to a scalar move of the immediate mask into EXEC (EXEC_LO on
+    // wave32), inserted at the top of the block.
+    // This should be before all vector instructions.
+    const bool isWave32 = ST.isWave32();
+    BuildMI(MBB, MBB.begin(), MI.getDebugLoc(),
+            get(isWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+            isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
+        .addImm(MI.getOperand(0).getImm());
+    MI.eraseFromParent();
+    break;
+  }
+  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
+    // Extract the thread count from an SGPR input and set EXEC accordingly.
+    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+    //
+    // S_BFE_U32 count, input, {shift, 7}
+    // S_BFM_B64 exec, count, 0
+    // S_CMP_EQ_U32 count, 64
+    // S_CMOV_B64 exec, -1
+    MachineFunction &MF = *MBB.getParent();
+    MachineInstr *FirstMI = nullptr;
+    Register InputReg = MI.getOperand(0).getReg();
+    bool isInputRead = false;
+
+    // Find definition of InputReg.
+    // Usually InputReg is a function live in, so there will not be one.
+    // Keep scanning up to MI even once a definition is found: the sequence is
+    // inserted right after that definition, so a read of InputReg anywhere
+    // before MI must stop InputReg from being reused as the temporary.
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MI.getIterator();
+         I != E; ++I) {
+      isInputRead = isInputRead || I->readsRegister(InputReg);
+      if (!FirstMI && I->definesRegister(InputReg))
+        FirstMI = &*(std::next(I));
+    }
+
+    // Select a temporary register to hold the count.
+    Register CountReg;
+    if (!isInputRead && MI.getOperand(0).isKill()) {
+      // InputReg only used for init -> reuse input register
+      CountReg = InputReg;
+    } else if (!FirstMI && (MF.begin() == MBB.getIterator())) {
+      // Insert at function start -> all non live-ins available
+      // Use VCC as that is always available.
+      CountReg = AMDGPU::VCC_LO;
+    } else {
+      // Rare edge case -> try to find a free register
+      RegScavenger RS;
+      RS.enterBasicBlock(MBB);
+      if (FirstMI)
+        RS.forward(FirstMI);
+      CountReg = RS.FindUnusedReg(&AMDGPU::SGPR_32RegClass);
+    }
+    if (!CountReg)
+      report_fatal_error("Cannot find register to build EXEC init mask");
+
+    // Update insertion point
+    if (!FirstMI)
+      FirstMI = &*MBB.begin();
+
+    // Insert initialisation sequence
+    const DebugLoc DL = MI.getDebugLoc();
+    const unsigned Mask = (ST.getWavefrontSize() << 1) - 1;
+    const bool isWave32 = ST.isWave32();
+    const Register Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    // BFE control word: masked shift amount in the low bits, 0x70000 (7 << 16)
+    // supplies the 7-bit field width.
+    BuildMI(MBB, FirstMI, DL, get(AMDGPU::S_BFE_U32), CountReg)
+        .addReg(InputReg)
+        .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+    BuildMI(MBB, FirstMI, DL,
+            get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+        .addReg(CountReg)
+        .addImm(0);
+    BuildMI(MBB, FirstMI, DL, get(AMDGPU::S_CMP_EQ_U32))
+        .addReg(CountReg, RegState::Kill)
+        .addImm(ST.getWavefrontSize());
+    BuildMI(MBB, FirstMI, DL,
+            get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), Exec)
+        .addImm(-1);
+    MI.eraseFromParent();
+    break;
+  }
   }
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -399,32 +399,13 @@
   (outs), (ins i64imm:$src),
   [(int_amdgcn_init_exec (i64 timm:$src))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
-  let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave64;
-}
-
-// FIXME: Intrinsic should be mangled for wave size.
-def SI_INIT_EXEC_LO : SPseudoInstSI <
-  (outs), (ins i32imm:$src), []> {
-  let Defs = [EXEC_LO];
-  let usesCustomInserter = 1;
   let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave32;
 }
 
-// FIXME: Wave32 version
 def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   (outs), (ins SSrc_b32:$input, i32imm:$shift),
   [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
-}
-
-def : GCNPat <
-  (int_amdgcn_init_exec timm:$src),
-  (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
-  let WaveSizePredicate = isWave32;
 }
 
 // Return for returning shaders to a shader variant epilog.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -22,9 +22,9 @@
 }
 
 ; GCN-LABEL: {{^}}input_s3off8:
-; GCN: s_bfe_u32 s0, s3, 0x70008
-; GCN: s_bfm_b64 exec, s0, 0
-; GCN: s_cmp_eq_u32 s0, 64
+; GCN: s_bfe_u32 s3, s3, 0x70008
+; GCN: s_bfm_b64 exec, s3, 0
+; GCN: s_cmp_eq_u32 s3, 64
 ; GCN: s_cmov_b64 exec, -1
 ; GCN: v_add_f32_e32 v0,
 define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
@@ -48,9 +48,9 @@
 }
 
 ; GCN-LABEL: {{^}}reuse_input:
-; GCN: s_bfe_u32 s1, s0, 0x70013
-; GCN: s_bfm_b64 exec, s1, 0
-; GCN: s_cmp_eq_u32 s1, 64
+; GCN: s_bfe_u32 vcc_lo, s0, 0x70013
+; GCN: s_bfm_b64 exec, vcc_lo, 0
+; GCN: s_cmp_eq_u32 vcc_lo, 64
 ; GCN: s_cmov_b64 exec, -1
 ; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
 define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
@@ -62,9 +62,9 @@
 }
 
 ; GCN-LABEL: {{^}}reuse_input2:
-; GCN: s_bfe_u32 s1, s0, 0x70013
-; GCN: s_bfm_b64 exec, s1, 0
-; GCN: s_cmp_eq_u32 s1, 64
+; GCN: s_bfe_u32 vcc_lo, s0, 0x70013
+; GCN: s_bfm_b64 exec, vcc_lo, 0
+; GCN: s_cmp_eq_u32 vcc_lo, 64
 ; GCN: s_cmov_b64 exec, -1
 ; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
 define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
@@ -84,6 +84,116 @@
   unreachable
 }
 
+; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
+; GCN-NOT: {{^}}v_
+; GCN: s_mov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
+main_body:
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+  call void @llvm.amdgcn.init.exec(i64 -1)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v = bitcast i32 %v5 to float
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
+; GCN-NOT: {{^}}v_
+; GCN: s_bfe_u32 s2, s2, 0x70008
+; GCN: s_bfm_b64 exec, s2, 0
+; GCN: s_cmp_eq_u32 s2, 64
+; GCN: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v = bitcast i32 %v5 to float
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_scavenge:
+; GCN-NOT: {{^}}v_
+; GCN: %endif
+; GCN: s_bfe_u32 s3, s2, 0x70008
+; GCN: s_bfm_b64 exec, s3, 0
+; GCN: s_cmp_eq_u32 s3, 64
+; GCN: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize_scavenge(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+  %cc = icmp uge i32 %count, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  call void asm sideeffect "", ""()
+  br label %endif
+
+endif:
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v6 = add i32 %v5, %count
+  %v = bitcast i32 %v6 to float
+  ret float %v
+}
+
 declare void @llvm.amdgcn.init.exec(i64) #1
 declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
@@ -12,9 +12,9 @@
 }
 
 ; GCN-LABEL: {{^}}test_init_exec_from_input:
-; GCN: s_bfe_u32 s0, s3, 0x70008
-; GFX1032: s_bfm_b32 exec_lo, s0, 0
-; GFX1032: s_cmp_eq_u32 s0, 32
+; GCN: s_bfe_u32 s3, s3, 0x70008
+; GFX1032: s_bfm_b32 exec_lo, s3, 0
+; GFX1032: s_cmp_eq_u32 s3, 32
 ; GFX1032: s_cmov_b32 exec_lo, -1
-; GFX1064: s_bfm_b64 exec, s0, 0
-; GFX1064: s_cmp_eq_u32 s0, 64
+; GFX1064: s_bfm_b64 exec, s3, 0
+; GFX1064: s_cmp_eq_u32 s3, 64