diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -182,6 +182,8 @@
 // Set EXEC according to a thread count packed in an SGPR input:
 //    thread_count = (input >> bitoffset) & 0x7f;
 // This is always moved to the beginning of the basic block.
+// Note: only inreg arguments to the parent function are valid as
+// inputs to this intrinsic; computed values cannot be used.
 def int_amdgcn_init_exec_from_input : Intrinsic<[],
   [llvm_i32_ty,  // 32-bit SGPR input
    llvm_i32_ty], // bit offset of the thread count
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4021,77 +4021,6 @@
     MI.eraseFromParent();
     return BB;
   }
-  case AMDGPU::SI_INIT_EXEC:
-    // This should be before all vector instructions.
-    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
-            AMDGPU::EXEC)
-        .addImm(MI.getOperand(0).getImm());
-    MI.eraseFromParent();
-    return BB;
-
-  case AMDGPU::SI_INIT_EXEC_LO:
-    // This should be before all vector instructions.
-    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
-            AMDGPU::EXEC_LO)
-        .addImm(MI.getOperand(0).getImm());
-    MI.eraseFromParent();
-    return BB;
-
-  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
-    // Extract the thread count from an SGPR input and set EXEC accordingly.
-    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
-    //
-    // S_BFE_U32 count, input, {shift, 7}
-    // S_BFM_B64 exec, count, 0
-    // S_CMP_EQ_U32 count, 64
-    // S_CMOV_B64 exec, -1
-    MachineInstr *FirstMI = &*BB->begin();
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    Register InputReg = MI.getOperand(0).getReg();
-    Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    bool Found = false;
-
-    // Move the COPY of the input reg to the beginning, so that we can use it.
-    for (auto I = BB->begin(); I != &MI; I++) {
-      if (I->getOpcode() != TargetOpcode::COPY ||
-          I->getOperand(0).getReg() != InputReg)
-        continue;
-
-      if (I == FirstMI) {
-        FirstMI = &*++BB->begin();
-      } else {
-        I->removeFromParent();
-        BB->insert(FirstMI, &*I);
-      }
-      Found = true;
-      break;
-    }
-    assert(Found);
-    (void)Found;
-
-    // This should be before all vector instructions.
-    unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
-    bool isWave32 = getSubtarget()->isWave32();
-    unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
-        .addReg(InputReg)
-        .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
-    BuildMI(*BB, FirstMI, DebugLoc(),
-            TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
-            Exec)
-        .addReg(CountReg)
-        .addImm(0);
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
-        .addReg(CountReg, RegState::Kill)
-        .addImm(getSubtarget()->getWavefrontSize());
-    BuildMI(*BB, FirstMI, DebugLoc(),
-            TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
-            Exec)
-        .addImm(-1);
-    MI.eraseFromParent();
-    return BB;
-  }
-
   case AMDGPU::GET_GROUPSTATICSIZE: {
     assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
            getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -399,32 +399,13 @@
   (outs), (ins i64imm:$src),
   [(int_amdgcn_init_exec (i64 timm:$src))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
-  let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave64;
-}
-
-// FIXME: Intrinsic should be mangled for wave size.
-def SI_INIT_EXEC_LO : SPseudoInstSI <
-  (outs), (ins i32imm:$src), []> {
-  let Defs = [EXEC_LO];
-  let usesCustomInserter = 1;
   let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave32;
 }

-// FIXME: Wave32 version
 def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   (outs), (ins SSrc_b32:$input, i32imm:$shift),
   [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
-}
-
-def : GCNPat <
-  (int_amdgcn_init_exec timm:$src),
-  (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
-  let WaveSizePredicate = isWave32;
 }

 // Return for returning shaders to a shader variant epilog.
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -93,6 +93,8 @@

   MachineBasicBlock *emitEndCf(MachineInstr &MI);

+  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
+
   void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                         SmallVectorImpl<MachineOperand> &Src) const;

@@ -661,6 +663,90 @@
   return SplitBB;
 }

+void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
+                                       MachineInstr &MI) {
+  MachineFunction &MF = *MBB->getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  bool IsWave32 = ST.isWave32();
+
+  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+    // This should be before all vector instructions.
+    BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
+        .addImm(MI.getOperand(0).getImm());
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(MI);
+    MI.eraseFromParent();
+    return;
+  }
+
+  // Extract the thread count from an SGPR input and set EXEC accordingly.
+  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+  //
+  // S_BFE_U32 count, input, {shift, 7}
+  // S_BFM_B64 exec, count, 0
+  // S_CMP_EQ_U32 count, 64
+  // S_CMOV_B64 exec, -1
+  Register InputReg = MI.getOperand(0).getReg();
+  MachineInstr *FirstMI = &*MBB->begin();
+  if (InputReg.isVirtual()) {
+    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+    assert(DefInstr && DefInstr->isCopy());
+    if (DefInstr->getParent() == MBB) {
+      if (DefInstr != FirstMI) {
+        // If the `InputReg` is defined in the current block, we also need to
+        // move that instruction to the beginning of the block.
+        DefInstr->removeFromParent();
+        MBB->insert(FirstMI, DefInstr);
+        if (LIS)
+          LIS->handleMove(*DefInstr);
+      } else {
+        // If the first instruction is the definition, advance the insertion
+        // point past it.
+        FirstMI = &*std::next(FirstMI->getIterator());
+      }
+    }
+  }
+
+  // Insert instruction sequence at block beginning (before vector operations).
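+  // S_BFE_U32 src1 encoding: bits [5:0] hold the field offset and bits
+  // [22:16] the field width, so "| 0x70000" below selects a 7-bit field,
+  // matching the 0x7f thread_count mask documented for the intrinsic.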
+ const DebugLoc DL = MI.getDebugLoc(); + const unsigned WavefrontSize = ST.getWavefrontSize(); + const unsigned Mask = (WavefrontSize << 1) - 1; + Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg) + .addReg(InputReg) + .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); + auto BfmMI = + BuildMI(*MBB, FirstMI, DL, + TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec) + .addReg(CountReg) + .addImm(0); + auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) + .addReg(CountReg, RegState::Kill) + .addImm(WavefrontSize); + auto CmovMI = + BuildMI(*MBB, FirstMI, DL, + TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), + Exec) + .addImm(-1); + + if (!LIS) { + MI.eraseFromParent(); + return; + } + + LIS->RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + + LIS->InsertMachineInstrInMaps(*BfeMI); + LIS->InsertMachineInstrInMaps(*BfmMI); + LIS->InsertMachineInstrInMaps(*CmpMI); + LIS->InsertMachineInstrInMaps(*CmovMI); + + LIS->removeInterval(InputReg); + LIS->createAndComputeVirtRegInterval(InputReg); + LIS->createAndComputeVirtRegInterval(CountReg); +} + bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { auto GetFallThroughSucc = [=](MachineBasicBlock *B) -> MachineBasicBlock * { auto *S = B->getNextNode(); @@ -781,6 +867,14 @@ SplitMBB = process(MI); break; + // FIXME: find a better place for this + case AMDGPU::SI_INIT_EXEC: + case AMDGPU::SI_INIT_EXEC_FROM_INPUT: + lowerInitExec(MBB, MI); + if (LIS) + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); + break; + default: break; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll @@ -84,6 +84,117 @@ unreachable } +; GCN-LABEL: {{^}}init_exec_before_frame_materialize: +; GCN-NOT: {{^}}v_ +; GCN: s_mov_b64 exec, -1 +; GCN: v_mov +; GCN: v_add +define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) { +main_body: + %array0 = alloca [1024 x i32], align 16, addrspace(5) + %array1 = alloca [20 x i32], align 16, addrspace(5) + call void @llvm.amdgcn.init.exec(i64 -1) + + %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1 + store i32 %a, i32 addrspace(5)* %ptr0, align 4 + + %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1 + store i32 %a, i32 addrspace(5)* %ptr1, align 4 + + %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2 + store i32 %b, i32 addrspace(5)* %ptr2, align 4 + + %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b + %v3 = load i32, i32 addrspace(5)* %ptr3, align 4 + + %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b + %v4 = load i32, i32 addrspace(5)* %ptr4, align 4 + + %v5 = add i32 %v3, %v4 + %v = bitcast i32 %v5 to float + ret float %v +} + +; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize: +; GCN-NOT: {{^}}v_ +; GCN: s_bfe_u32 s2, s2, 0x70008 +; GCN-NEXT: s_bfm_b64 exec, s2, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 64 +; GCN-NEXT: s_cmov_b64 exec, -1 +; GCN: v_mov +; GCN: v_add +define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) { +main_body: + %array0 = alloca [1024 x i32], align 16, addrspace(5) 
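+  ; These allocas force frame materialization; the checks above verify that
+  ; EXEC is initialized before any frame-setup v_ instructions are emitted.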
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v = bitcast i32 %v5 to float
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
+; GCN-NOT: {{^}}v_
+; GCN: %endif
+; GCN: s_bfe_u32 s3, s2, 0x70008
+; GCN-NEXT: s_bfm_b64 exec, s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s3, 64
+; GCN-NEXT: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+  ; ideally these allocas would be in %endif, but this causes problems with GlobalISel on Windows
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+
+  %cc = icmp uge i32 %count, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  call void asm sideeffect "", ""()
+  br label %endif
+
+endif:
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v6 = add i32 %v5, %count
+  %v = bitcast i32 %v6 to float
+  ret float %v
+}
+
 declare void @llvm.amdgcn.init.exec(i64) #1
 declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1
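
; For reference, a minimal usage sketch (illustrative names; not part of the
; patch). Per the note added to IntrinsicsAMDGPU.td, the input operand of
; llvm.amdgcn.init.exec.from.input must be an inreg argument of the calling
; function; a value computed inside the function is not a valid input.
;
;   define amdgpu_ps float @sketch(i32 inreg %packed) {
;   main_body:
;     ; enable the low N lanes, where N = (%packed >> 8) & 0x7f
;     call void @llvm.amdgcn.init.exec.from.input(i32 %packed, i32 8)
;     ret float 0.000000e+00
;   }
;
;   declare void @llvm.amdgcn.init.exec.from.input(i32, i32)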