diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4025,77 +4025,6 @@
     MI.eraseFromParent();
     return BB;
   }
-  case AMDGPU::SI_INIT_EXEC:
-    // This should be before all vector instructions.
-    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
-            AMDGPU::EXEC)
-        .addImm(MI.getOperand(0).getImm());
-    MI.eraseFromParent();
-    return BB;
-
-  case AMDGPU::SI_INIT_EXEC_LO:
-    // This should be before all vector instructions.
-    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
-            AMDGPU::EXEC_LO)
-        .addImm(MI.getOperand(0).getImm());
-    MI.eraseFromParent();
-    return BB;
-
-  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
-    // Extract the thread count from an SGPR input and set EXEC accordingly.
-    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
-    //
-    // S_BFE_U32 count, input, {shift, 7}
-    // S_BFM_B64 exec, count, 0
-    // S_CMP_EQ_U32 count, 64
-    // S_CMOV_B64 exec, -1
-    MachineInstr *FirstMI = &*BB->begin();
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    Register InputReg = MI.getOperand(0).getReg();
-    Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    bool Found = false;
-
-    // Move the COPY of the input reg to the beginning, so that we can use it.
-    for (auto I = BB->begin(); I != &MI; I++) {
-      if (I->getOpcode() != TargetOpcode::COPY ||
-          I->getOperand(0).getReg() != InputReg)
-        continue;
-
-      if (I == FirstMI) {
-        FirstMI = &*++BB->begin();
-      } else {
-        I->removeFromParent();
-        BB->insert(FirstMI, &*I);
-      }
-      Found = true;
-      break;
-    }
-    assert(Found);
-    (void)Found;
-
-    // This should be before all vector instructions.
-    unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
-    bool isWave32 = getSubtarget()->isWave32();
-    unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
-        .addReg(InputReg)
-        .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
-    BuildMI(*BB, FirstMI, DebugLoc(),
-            TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
-            Exec)
-        .addReg(CountReg)
-        .addImm(0);
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
-        .addReg(CountReg, RegState::Kill)
-        .addImm(getSubtarget()->getWavefrontSize());
-    BuildMI(*BB, FirstMI, DebugLoc(),
-            TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
-            Exec)
-        .addImm(-1);
-    MI.eraseFromParent();
-    return BB;
-  }
-
   case AMDGPU::GET_GROUPSTATICSIZE: {
     assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
            getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1860,6 +1860,90 @@
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
+  case AMDGPU::SI_INIT_EXEC: {
+    // This should be before all vector instructions.
+    const bool isWave32 = ST.isWave32();
+    BuildMI(MBB, MBB.begin(), MI.getDebugLoc(),
+            get(isWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+            isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
+        .addImm(MI.getOperand(0).getImm());
+    MI.eraseFromParent();
+    break;
+  }
+  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
+    // Extract the thread count from an SGPR input and set EXEC accordingly.
+    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+    //
+    // S_BFE_U32 count, input, {shift, 7}
+    // S_BFM_B64 exec, count, 0
+    // S_CMP_EQ_U32 count, 64
+    // S_CMOV_B64 exec, -1
+    MachineFunction &MF = *MBB.getParent();
+    MachineInstr *FirstMI = nullptr;
+    Register InputReg = MI.getOperand(0).getReg();
+    bool isInputRead = false;
+
+    // Find the definition of InputReg closest to MI, if there is one.
+    // Usually InputReg is a function live in, so there will not be one.
+    // The sequence is inserted right after that definition, so only reads
+    // between the insertion point and MI could observe a clobbered InputReg.
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MI.getIterator();
+         I != E; ++I) {
+      if (I->definesRegister(InputReg)) {
+        FirstMI = &*(std::next(I));
+        isInputRead = false;
+      } else {
+        isInputRead = isInputRead || I->readsRegister(InputReg);
+      }
+    }
+
+    // Select a temporary register to hold the count.
+    Register CountReg;
+    if (!isInputRead && MI.getOperand(0).isKill()) {
+      // InputReg only used for init -> reuse input register
+      CountReg = InputReg;
+    } else if (!FirstMI && (MF.begin() == MBB.getIterator())) {
+      // Insert at function start -> all non live-ins available
+      // Use VCC as that is always available.
+      CountReg = AMDGPU::VCC_LO;
+    } else {
+      // Rare edge case -> try to find a free register
+      RegScavenger RS;
+      RS.enterBasicBlock(MBB);
+      // forward() also processes the instruction it stops on, so stop on
+      // the definition to get liveness at the insertion point.
+      if (FirstMI)
+        RS.forward(std::prev(MachineBasicBlock::iterator(FirstMI)));
+      CountReg = RS.FindUnusedReg(&AMDGPU::SGPR_32RegClass);
+    }
+    if (!CountReg)
+      report_fatal_error("Cannot find register to build EXEC init mask");
+
+    // Update insertion point
+    if (!FirstMI)
+      FirstMI = &*MBB.begin();
+
+    // Insert initialisation sequence
+    const DebugLoc DL = MI.getDebugLoc();
+    const unsigned Mask = (ST.getWavefrontSize() << 1) - 1;
+    const bool isWave32 = ST.isWave32();
+    const Register Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    BuildMI(MBB, FirstMI, DL, get(AMDGPU::S_BFE_U32), CountReg)
+        .addReg(InputReg)
+        .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+    BuildMI(MBB, FirstMI, DL,
+            get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+        .addReg(CountReg)
+        .addImm(0);
+    BuildMI(MBB, FirstMI, DL, get(AMDGPU::S_CMP_EQ_U32))
+        .addReg(CountReg, RegState::Kill)
+        .addImm(ST.getWavefrontSize());
+    BuildMI(MBB, FirstMI, DL,
+            get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), Exec)
+        .addImm(-1);
+    MI.eraseFromParent();
+    break;
+  }
   }
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -399,32 +399,13 @@
   (outs), (ins i64imm:$src),
   [(int_amdgcn_init_exec (i64 timm:$src))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
-  let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave64;
-}
-
-// FIXME: Intrinsic should be mangled for wave size.
-def SI_INIT_EXEC_LO : SPseudoInstSI <
-  (outs), (ins i32imm:$src), []> {
-  let Defs = [EXEC_LO];
-  let usesCustomInserter = 1;
   let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave32;
 }
 
-// FIXME: Wave32 version
 def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   (outs), (ins SSrc_b32:$input, i32imm:$shift),
   [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
-}
-
-def : GCNPat <
-  (int_amdgcn_init_exec timm:$src),
-  (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
-  let WaveSizePredicate = isWave32;
 }
 
 // Return for returning shaders to a shader variant epilog.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -22,9 +22,9 @@
 }
 
 ; GCN-LABEL: {{^}}input_s3off8:
-; GCN: s_bfe_u32 s0, s3, 0x70008
-; GCN: s_bfm_b64 exec, s0, 0
-; GCN: s_cmp_eq_u32 s0, 64
+; GCN: s_bfe_u32 s3, s3, 0x70008
+; GCN: s_bfm_b64 exec, s3, 0
+; GCN: s_cmp_eq_u32 s3, 64
 ; GCN: s_cmov_b64 exec, -1
 ; GCN: v_add_f32_e32 v0,
 define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
@@ -48,9 +48,9 @@
 }
 
 ; GCN-LABEL: {{^}}reuse_input:
-; GCN: s_bfe_u32 s1, s0, 0x70013
-; GCN: s_bfm_b64 exec, s1, 0
-; GCN: s_cmp_eq_u32 s1, 64
+; GCN: s_bfe_u32 vcc_lo, s0, 0x70013
+; GCN: s_bfm_b64 exec, vcc_lo, 0
+; GCN: s_cmp_eq_u32 vcc_lo, 64
 ; GCN: s_cmov_b64 exec, -1
 ; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
 define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
@@ -62,9 +62,9 @@
 }
 
 ; GCN-LABEL: {{^}}reuse_input2:
-; GCN: s_bfe_u32 s1, s0, 0x70013
-; GCN: s_bfm_b64 exec, s1, 0
-; GCN: s_cmp_eq_u32 s1, 64
+; GCN: s_bfe_u32 vcc_lo, s0, 0x70013
+; GCN: s_bfm_b64 exec, vcc_lo, 0
+; GCN: s_cmp_eq_u32 vcc_lo, 64
 ; GCN: s_cmov_b64 exec, -1
 ; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
 define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
@@ -84,6 +84,116 @@
   unreachable
 }
 
+; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
+; GCN-NOT: {{^}}v_
+; GCN: s_mov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
+main_body:
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+  call void @llvm.amdgcn.init.exec(i64 -1)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v = bitcast i32 %v5 to float
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
+; GCN-NOT: {{^}}v_
+; GCN: s_bfe_u32 s2, s2, 0x70008
+; GCN: s_bfm_b64 exec, s2, 0
+; GCN: s_cmp_eq_u32 s2, 64
+; GCN: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v = bitcast i32 %v5 to float
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_scavenge:
+; GCN-NOT: {{^}}v_
+; GCN: %endif
+; GCN: s_bfe_u32 s3, s2, 0x70008
+; GCN: s_bfm_b64 exec, s3, 0
+; GCN: s_cmp_eq_u32 s3, 64
+; GCN: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize_scavenge(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+  %cc = icmp uge i32 %count, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  call void asm sideeffect "", ""()
+  br label %endif
+
+endif:
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v6 = add i32 %v5, %count
+  %v = bitcast i32 %v6 to float
+  ret float %v
+}
+
 declare void @llvm.amdgcn.init.exec(i64) #1
 declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
@@ -12,9 +12,9 @@
 }
 
 ; GCN-LABEL: {{^}}test_init_exec_from_input:
-; GCN: s_bfe_u32 s0, s3, 0x70008
-; GFX1032: s_bfm_b32 exec_lo, s0, 0
-; GFX1032: s_cmp_eq_u32 s0, 32
+; GCN: s_bfe_u32 s3, s3, 0x70008
+; GFX1032: s_bfm_b32 exec_lo, s3, 0
+; GFX1032: s_cmp_eq_u32 s3, 32
 ; GFX1032: s_cmov_b32 exec_lo, -1
-; GFX1064: s_bfm_b64 exec, s0, 0
-; GFX1064: s_cmp_eq_u32 s0, 64
+; GFX1064: s_bfm_b64 exec, s3, 0
+; GFX1064: s_cmp_eq_u32 s3, 64