Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td @@ -108,6 +108,21 @@ GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; +// Set EXEC to the 64-bit value given. +// This is always moved to the beginning of the basic block. +def int_amdgcn_init_exec : Intrinsic<[], + [llvm_i64_ty], // 64-bit literal constant + [IntrConvergent]>; + +// Set EXEC according to a thread count packed in an SGPR input: +// thread_count = (input >> bitoffset) & 0x7f; +// This is always moved to the beginning of the basic block. +def int_amdgcn_init_exec_from_input : Intrinsic<[], + [llvm_i32_ty, // 32-bit SGPR input + llvm_i32_ty], // bit offset of the thread count + [IntrConvergent]>; + + //===----------------------------------------------------------------------===// // Instruction Intrinsics //===----------------------------------------------------------------------===// Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -369,6 +369,8 @@ BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. 
CONST_DATA_PTR, + INIT_EXEC, + INIT_EXEC_FROM_INPUT, SENDMSG, SENDMSGHALT, INTERP_MOV, Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3516,6 +3516,8 @@ NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; + NODE_NAME_CASE(INIT_EXEC) + NODE_NAME_CASE(INIT_EXEC_FROM_INPUT) NODE_NAME_CASE(SENDMSG) NODE_NAME_CASE(SENDMSGHALT) NODE_NAME_CASE(INTERP_MOV) Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -299,6 +299,15 @@ def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + +def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT", + SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisInt<1>]>, + [SDNPHasChain, SDNPInGlue]>; + def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1957,6 +1957,63 @@ MI.eraseFromParent(); return BB; + case AMDGPU::SI_INIT_EXEC: + // This should be before all vector instructions. + BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), + AMDGPU::EXEC) + .addImm(MI.getOperand(0).getImm()); + MI.eraseFromParent(); + return BB; + + case AMDGPU::SI_INIT_EXEC_FROM_INPUT: { + // Extract the thread count from an SGPR input and set EXEC accordingly. 
+  // Since BFM can't shift by 64, handle that case with CMP + CMOV. +  // +  // S_BFE_U32 count, input, {shift, 7} +  // S_BFM_B64 exec, count, 0 +  // S_CMP_EQ_U32 count, 64 +  // S_CMOV_B64 exec, -1 +  MachineInstr *FirstMI = &*BB->begin(); +  MachineRegisterInfo &MRI = MF->getRegInfo(); +  unsigned InputReg = MI.getOperand(0).getReg(); +  unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); +  bool Found = false; + +  // Move the COPY of the input reg to the beginning, so that we can use it. +  for (auto I = BB->begin(); I != &MI; I++) { +  if (I->getOpcode() != TargetOpcode::COPY || +  I->getOperand(0).getReg() != InputReg) +  continue; + +  if (I == FirstMI) { +  FirstMI = &*++BB->begin(); +  } else { +  I->removeFromParent(); +  BB->insert(FirstMI, &*I); +  } +  Found = true; +  break; +  } +  assert(Found); (void)Found; // Avoid unused-variable warning in NDEBUG builds. + +  // This should be before all vector instructions. +  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) +  .addReg(InputReg) +  .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000); +  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64), +  AMDGPU::EXEC) +  .addReg(CountReg) +  .addImm(0); +  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32)) +  .addReg(CountReg, RegState::Kill) +  .addImm(64); +  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64), +  AMDGPU::EXEC) +  .addImm(-1); +  MI.eraseFromParent(); +  return BB; +  } +  case AMDGPU::GET_GROUPSTATICSIZE: { DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) @@ -3224,6 +3281,14 @@ return DAG.getNode(NodeOp, DL, MVT::Other, Chain, Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_init_exec: { + return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, + Op.getOperand(2)); + } + case Intrinsic::amdgcn_init_exec_from_input: { + return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, + Op.getOperand(2), Op.getOperand(3)); + } case AMDGPUIntrinsic::SI_tbuffer_store: { SDValue Ops[] = { Chain, Index: 
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td @@ -286,6 +286,19 @@ let isReMaterializable = 1; } +def SI_INIT_EXEC : SPseudoInstSI < + (outs), (ins i64imm:$src), []> { + let Defs = [EXEC]; + let usesCustomInserter = 1; + let isAsCheapAsAMove = 1; +} + +def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < + (outs), (ins SSrc_b32:$input, i32imm:$shift), []> { + let Defs = [EXEC]; + let usesCustomInserter = 1; +} + // Return for returning shaders to a shader variant epilog. def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -399,6 +412,16 @@ } // End SubtargetPredicate = isGCN let Predicates = [isGCN] in { +def : Pat < + (AMDGPUinit_exec i64:$src), + (SI_INIT_EXEC (as_i64imm $src)) +>; + +def : Pat < + (AMDGPUinit_exec_from_input i32:$input, i32:$shift), + (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift)) +>; + def : Pat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll @@ -0,0 +1,80 @@ +;RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN + +; GCN-LABEL: {{^}}full_mask: +; GCN: s_mov_b64 exec, -1 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @full_mask(float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec(i64 -1) + ret float %s +} + +; GCN-LABEL: {{^}}partial_mask: +; GCN: s_mov_b64 exec, 0x1e240 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @partial_mask(float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec(i64 123456) + ret float %s +} + +; GCN-LABEL: {{^}}input_s3off8: +; 
GCN: s_bfe_u32 s0, s3, 0x70008 +; GCN: s_bfm_b64 exec, s0, 0 +; GCN: s_cmp_eq_u32 s0, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8) + ret float %s +} + +; GCN-LABEL: {{^}}input_s0off19: +; GCN: s_bfe_u32 s0, s0, 0x70013 +; GCN: s_bfm_b64 exec, s0, 0 +; GCN: s_cmp_eq_u32 s0, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19) + ret float %s +} + +; GCN-LABEL: {{^}}reuse_input: +; GCN: s_bfe_u32 s1, s0, 0x70013 +; GCN: s_bfm_b64 exec, s1, 0 +; GCN: s_cmp_eq_u32 s1, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_i32_e32 v0, vcc, s0, v0 +define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) { +main_body: + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19) + %s = add i32 %a, %count + %f = sitofp i32 %s to float + ret float %f +} + +; GCN-LABEL: {{^}}reuse_input2: +; GCN: s_bfe_u32 s1, s0, 0x70013 +; GCN: s_bfm_b64 exec, s1, 0 +; GCN: s_cmp_eq_u32 s1, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_i32_e32 v0, vcc, s0, v0 +define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) { +main_body: + %s = add i32 %a, %count + %f = sitofp i32 %s to float + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19) + ret float %f +} + +declare void @llvm.amdgcn.init.exec(i64) #1 +declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1 + +attributes #1 = { convergent }