AMDGPU/GlobalISel: handle llvm.amdgcn.init.exec and
llvm.amdgcn.init.exec.from.input.

Summary of what this patch does:
  * Moves the selection patterns for both intrinsics onto the SI_INIT_EXEC and
    SI_INIT_EXEC_FROM_INPUT pseudos (using timm operands) and keeps a separate
    wave32 pattern that selects SI_INIT_EXEC_LO.
  * Adds SGPR register-bank mappings for both intrinsics.
  * Adds the as_i32timm transform with a GlobalISel renderer (renderTruncImm32).
  * Moves the init.exec tests out of wave32.ll into their own file and adds
    GlobalISel/ wrappers that re-run the SelectionDAG test files. The wrapper
    RUN lines pass -global-isel so that they exercise the GlobalISel path.

NOTE(review): as_i32timm / gi_as_i32timm are added below, but the wave32
pattern still writes (as_i32imm imm:$src) for a timm:$src input. Confirm
whether it was meant to be (as_i32timm timm:$src).
NOTE(review): renderTruncImm32 adds the full constant returned by
getConstantVRegVal and does not truncate it. Confirm this is intended for
constants that do not fit in 32 bits.

Index: lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUGISel.td
+++ lib/Target/AMDGPU/AMDGPUGISel.td
@@ -202,3 +202,6 @@
 foreach Ty = [i64, p0, p1, p4] in {
   defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
 }
+
+def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">,
+  GISDNodeXFormEquiv<as_i32timm>;
Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -156,6 +156,9 @@
   InstructionSelector::ComplexRendererFns
   selectDS1Addr1Offset(MachineOperand &Root) const;
 
+  void renderTruncImm32(MachineInstrBuilder &MIB,
+                        const MachineInstr &MI) const;
+
   const SIInstrInfo &TII;
   const SIRegisterInfo &TRI;
   const AMDGPURegisterBankInfo &RBI;
Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2121,3 +2121,12 @@
       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
     }};
 }
+
+void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
+                                                 const MachineInstr &MI) const {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+  Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI);
+  assert(CstVal && "Expected constant value");
+  MIB.addImm(CstVal.getValue());
+}
Index: lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2591,7 +2591,8 @@
       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
       break;
     }
-    case Intrinsic::amdgcn_end_cf: {
+    case Intrinsic::amdgcn_end_cf:
+    case Intrinsic::amdgcn_init_exec: {
       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
       break;
@@ -2644,6 +2645,12 @@
       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
       break;
     }
+    case Intrinsic::amdgcn_init_exec_from_input: {
+      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+      break;
+    }
     default:
       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -689,6 +689,10 @@
   return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
 }]>;
 
+def as_i32timm: SDNodeXForm<timm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
 def as_i64imm: SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64);
 }]>;
@@ -728,6 +732,10 @@
   [{return isUInt<16>(Imm);}]
 >;
 
+def i64imm_32bit : ImmLeaf<i64, [{
+  return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
+}]>;
+
 class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
   return isInlineImmediate(N);
 }]>;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -349,7 +349,8 @@
 }
 
 def SI_INIT_EXEC : SPseudoInstSI <
-  (outs), (ins i64imm:$src), []> {
+  (outs), (ins i64imm:$src),
+  [(int_amdgcn_init_exec (i64 timm:$src))]> {
   let Defs = [EXEC];
   let usesCustomInserter = 1;
   let isAsCheapAsAMove = 1;
@@ -365,12 +366,20 @@
   let WaveSizePredicate = isWave32;
 }
 
+// FIXME: Wave32 version
 def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
-  (outs), (ins SSrc_b32:$input, i32imm:$shift), []> {
+  (outs), (ins SSrc_b32:$input, i32imm:$shift),
+  [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
   let Defs = [EXEC];
   let usesCustomInserter = 1;
 }
 
+def : GCNPat <
+  (int_amdgcn_init_exec timm:$src),
+  (SI_INIT_EXEC_LO (as_i32imm imm:$src))> {
+  let WaveSizePredicate = isWave32;
+}
+
 // Return for returning shaders to a shader variant epilog.
 def SI_RETURN_TO_EPILOG : SPseudoInstSI <
   (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -609,23 +618,6 @@
   (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
 >;
 
-def : GCNPat <
-  (int_amdgcn_init_exec i64:$src),
-  (SI_INIT_EXEC (as_i64imm $src))> {
-  let WaveSizePredicate = isWave64;
-}
-
-def : GCNPat <
-  (int_amdgcn_init_exec i64:$src),
-  (SI_INIT_EXEC_LO (as_i32imm $src))> {
-  let WaveSizePredicate = isWave32;
-}
-
-def : GCNPat <
-  (int_amdgcn_init_exec_from_input i32:$input, i32:$shift),
-  (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift))
->;
-
 def : GCNPat<
   (AMDGPUtrap timm:$trapid),
   (S_TRAP $trapid)
Index: test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.init.exec.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.init.exec.ll
@@ -0,0 +1,2 @@
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %S/../llvm.amdgcn.init.exec.ll | FileCheck -check-prefix=GCN %S/../llvm.amdgcn.init.exec.ll
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %S/../llvm.amdgcn.init.exec.ll | FileCheck -check-prefix=GCN %S/../llvm.amdgcn.init.exec.ll
Index: test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.init.exec.wave32.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.init.exec.wave32.ll
@@ -0,0 +1,2 @@
+; Runs original SDAG test with -global-isel
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %S/../llvm.amdgcn.init.exec.wave32.ll | FileCheck -check-prefixes=GCN,GFX1032 %S/../llvm.amdgcn.init.exec.wave32.ll
Index: test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN
+; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}full_mask:
 ; GCN: s_mov_b64 exec, -1
@@ -51,7 +52,7 @@
 ; GCN: s_bfm_b64 exec, s1, 0
 ; GCN: s_cmp_eq_u32 s1, 64
 ; GCN: s_cmov_b64 exec, -1
-; GCN: v_add_u32_e32 v0, s0, v0
+; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
 define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
 main_body:
   call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
@@ -65,7 +66,7 @@
 ; GCN: s_bfm_b64 exec, s1, 0
 ; GCN: s_cmp_eq_u32 s1, 64
 ; GCN: s_cmov_b64 exec, -1
-; GCN: v_add_u32_e32 v0, s0, v0
+; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
 define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
 main_body:
   %s = add i32 %a, %count
Index: test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+
+; GCN-LABEL: {{^}}test_init_exec:
+; GFX1032: s_mov_b32 exec_lo, 0x12345
+; GFX1064: s_mov_b64 exec, 0x12345
+; GCN: v_add_f32_e32 v0,
+define amdgpu_ps float @test_init_exec(float %a, float %b) {
+main_body:
+  %s = fadd float %a, %b
+  call void @llvm.amdgcn.init.exec(i64 74565)
+  ret float %s
+}
+
+; GCN-LABEL: {{^}}test_init_exec_from_input:
+; GCN: s_bfe_u32 s0, s3, 0x70008
+; GFX1032: s_bfm_b32 exec_lo, s0, 0
+; GFX1032: s_cmp_eq_u32 s0, 32
+; GFX1032: s_cmov_b32 exec_lo, -1
+; GFX1064: s_bfm_b64 exec, s0, 0
+; GFX1064: s_cmp_eq_u32 s0, 64
+; GFX1064: s_cmov_b64 exec, -1
+; GCN: v_add_f32_e32 v0,
+define amdgpu_ps float @test_init_exec_from_input(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
+main_body:
+  %s = fadd float %a, %b
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+  ret float %s
+}
+
+declare void @llvm.amdgcn.init.exec(i64)
+declare void @llvm.amdgcn.init.exec.from.input(i32, i32)
Index: test/CodeGen/AMDGPU/wave32.ll
===================================================================
--- test/CodeGen/AMDGPU/wave32.ll
+++ test/CodeGen/AMDGPU/wave32.ll
@@ -871,33 +871,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}test_init_exec:
-; GFX1032: s_mov_b32 exec_lo, 0x12345
-; GFX1064: s_mov_b64 exec, 0x12345
-; GCN: v_add_f32_e32 v0,
-define amdgpu_ps float @test_init_exec(float %a, float %b) {
-main_body:
-  %s = fadd float %a, %b
-  call void @llvm.amdgcn.init.exec(i64 74565)
-  ret float %s
-}
-
-; GCN-LABEL: {{^}}test_init_exec_from_input:
-; GCN: s_bfe_u32 s0, s3, 0x70008
-; GFX1032: s_bfm_b32 exec_lo, s0, 0
-; GFX1032: s_cmp_eq_u32 s0, 32
-; GFX1032: s_cmov_b32 exec_lo, -1
-; GFX1064: s_bfm_b64 exec, s0, 0
-; GFX1064: s_cmp_eq_u32 s0, 64
-; GFX1064: s_cmov_b64 exec, -1
-; GCN: v_add_f32_e32 v0,
-define amdgpu_ps float @test_init_exec_from_input(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
-main_body:
-  %s = fadd float %a, %b
-  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
-  ret float %s
-}
-
 ; GCN-LABEL: {{^}}test_vgprblocks_w32_attr:
 ; Test that the wave size can be overridden in function attributes and that the block size is correct as a result
 ; GFX10DEFWAVE: ; VGPRBlocks: 1
@@ -1132,8 +1105,6 @@
 declare void @llvm.amdgcn.kill(i1)
 declare i1 @llvm.amdgcn.wqm.vote(i1)
 declare i1 @llvm.amdgcn.ps.live()
-declare void @llvm.amdgcn.init.exec(i64)
-declare void @llvm.amdgcn.init.exec.from.input(i32, i32)
 declare i64 @llvm.cttz.i64(i64, i1)
 declare i32 @llvm.cttz.i32(i32, i1)
 