Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td @@ -108,6 +108,21 @@ GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; +// Set EXEC to the 64-bit value given. +// This is always moved to the beginning of the basic block. +def int_amdgcn_init_exec : Intrinsic<[], + [llvm_i64_ty], // 64-bit literal constant + [IntrConvergent]>; + +// Set EXEC according to a thread count packed in an SGPR input: +// thread_count = (input >> bitoffset) & 0x7f; +// This is always moved to the beginning of the basic block. +def int_amdgcn_init_exec_from_input : Intrinsic<[], + [llvm_i32_ty, // 32-bit SGPR input + llvm_i32_ty], // bit offset of the thread count + [IntrConvergent]>; + + //===----------------------------------------------------------------------===// // Instruction Intrinsics //===----------------------------------------------------------------------===// Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -369,6 +369,8 @@ BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. 
CONST_DATA_PTR, + INIT_EXEC, + INIT_EXEC_FROM_INPUT, SENDMSG, SENDMSGHALT, INTERP_MOV, Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3516,6 +3516,8 @@ NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; + NODE_NAME_CASE(INIT_EXEC) + NODE_NAME_CASE(INIT_EXEC_FROM_INPUT) NODE_NAME_CASE(SENDMSG) NODE_NAME_CASE(SENDMSGHALT) NODE_NAME_CASE(INTERP_MOV) Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -299,6 +299,15 @@ def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + +def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT", + SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisInt<1>]>, + [SDNPHasChain, SDNPInGlue]>; + def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1957,6 +1957,63 @@ MI.eraseFromParent(); return BB; + case AMDGPU::SI_INIT_EXEC: + // This should be before all vector instructions. + BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), + AMDGPU::EXEC) + .addImm(MI.getOperand(0).getImm()); + MI.eraseFromParent(); + return BB; + + case AMDGPU::SI_INIT_EXEC_FROM_INPUT: { + // Extract the thread count from an SGPR input and set EXEC accordingly. 
+  // Since BFM can't shift by 64, handle that case with CMP + CMOV. +  // +  // S_BFE_U32 count, input, {shift, 7} +  // S_BFM_B64 exec, count, 0 +  // S_CMP_EQ_U32 count, 64 +  // S_CMOV_B64 exec, -1 +  MachineInstr *FirstMI = &*BB->begin(); +  MachineRegisterInfo &MRI = MF->getRegInfo(); +  unsigned InputReg = MI.getOperand(0).getReg(); +  unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); +  bool Found = false; + +  // Move the COPY of the input reg to the beginning, so that we can use it. +  for (auto I = BB->begin(); I != &MI; I++) { +  if (I->getOpcode() != TargetOpcode::COPY || +  I->getOperand(0).getReg() != InputReg) +  continue; + +  if (I == FirstMI) { +  FirstMI = &*++BB->begin(); +  } else { +  I->removeFromParent(); +  BB->insert(FirstMI, &*I); +  } +  Found = true; +  break; +  } +  assert(Found); (void)Found; // Avoid unused-variable warning in NDEBUG builds. + +  // This should be before all vector instructions. +  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) +  .addReg(InputReg) +  .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000); +  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64), +  AMDGPU::EXEC) +  .addReg(CountReg) +  .addImm(0); +  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32)) +  .addReg(CountReg, RegState::Kill) +  .addImm(64); +  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64), +  AMDGPU::EXEC) +  .addImm(-1); +  MI.eraseFromParent(); +  return BB; +  } +  case AMDGPU::GET_GROUPSTATICSIZE: { DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) @@ -3224,6 +3281,14 @@ return DAG.getNode(NodeOp, DL, MVT::Other, Chain, Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_init_exec: { + return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, + Op.getOperand(2)); + } + case Intrinsic::amdgcn_init_exec_from_input: { + return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, + Op.getOperand(2), Op.getOperand(3)); + } case AMDGPUIntrinsic::SI_tbuffer_store: { SDValue Ops[] = { Chain, Index: 
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td @@ -286,6 +286,19 @@ let isReMaterializable = 1; } +def SI_INIT_EXEC : SPseudoInstSI < + (outs), (ins i64imm:$src), []> { + let Defs = [EXEC]; + let usesCustomInserter = 1; + let isAsCheapAsAMove = 1; +} + +def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < + (outs), (ins SSrc_b32:$input, i32imm:$shift), []> { + let Defs = [EXEC]; + let usesCustomInserter = 1; +} + // Return for returning shaders to a shader variant epilog. def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -399,6 +412,16 @@ } // End SubtargetPredicate = isGCN let Predicates = [isGCN] in { +def : Pat < + (AMDGPUinit_exec i64:$src), + (SI_INIT_EXEC (as_i64imm $src)) +>; + +def : Pat < + (AMDGPUinit_exec_from_input i32:$input, i32:$shift), + (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift)) +>; + def : Pat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll @@ -0,0 +1,80 @@ +;RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN + +; GCN-LABEL: {{^}}full_mask: +; GCN: s_mov_b64 exec, -1 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @full_mask(float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec(i64 -1) + ret float %s +} + +; GCN-LABEL: {{^}}partial_mask: +; GCN: s_mov_b64 exec, 0x1e240 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @partial_mask(float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec(i64 123456) + ret float %s +} + +; GCN-LABEL: {{^}}input_s3off8: +; 
GCN: s_bfe_u32 s0, s3, 0x70008 +; GCN: s_bfm_b64 exec, s0, 0 +; GCN: s_cmp_eq_u32 s0, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8) + ret float %s +} + +; GCN-LABEL: {{^}}input_s0off19: +; GCN: s_bfe_u32 s0, s0, 0x70013 +; GCN: s_bfm_b64 exec, s0, 0 +; GCN: s_cmp_eq_u32 s0, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19) + ret float %s +} + +; GCN-LABEL: {{^}}reuse_input: +; GCN: s_bfe_u32 s1, s0, 0x70013 +; GCN: s_bfm_b64 exec, s1, 0 +; GCN: s_cmp_eq_u32 s1, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_i32_e32 v0, vcc, s0, v0 +define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) { +main_body: + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19) + %s = add i32 %a, %count + %f = sitofp i32 %s to float + ret float %f +} + +; GCN-LABEL: {{^}}reuse_input2: +; GCN: s_bfe_u32 s1, s0, 0x70013 +; GCN: s_bfm_b64 exec, s1, 0 +; GCN: s_cmp_eq_u32 s1, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_i32_e32 v0, vcc, s0, v0 +define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) { +main_body: + %s = add i32 %a, %count + %f = sitofp i32 %s to float + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19) + ret float %f +} + +declare void @llvm.amdgcn.init.exec(i64) #1 +declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1 + +attributes #1 = { convergent }