Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -105,6 +105,7 @@
   bool selectG_INSERT(MachineInstr &I) const;
 
   bool selectInterpP1F16(MachineInstr &MI) const;
+  bool selectWritelane(MachineInstr &MI) const;
   bool selectDivScale(MachineInstr &MI) const;
   bool selectIntrinsicIcmp(MachineInstr &MI) const;
   bool selectBallot(MachineInstr &I) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -819,6 +819,63 @@
   return true;
 }
 
+// Writelane is special in that it can use SGPR and M0 (which would normally
+// count as using the constant bus twice - but in this case it is allowed since
+// the lane selector doesn't count as a use of the constant bus). However, it is
+// still required to abide by the 1 SGPR rule. Fix this up if we might have
+// multiple SGPRs.
+bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
+  // With a constant bus limit of at least 2, there's no issue.
+  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
+    return selectImpl(MI, *CoverageInfo);
+
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  Register VDst = MI.getOperand(0).getReg();
+  Register Val = MI.getOperand(2).getReg();
+  Register LaneSelect = MI.getOperand(3).getReg();
+  Register VDstIn = MI.getOperand(4).getReg();
+
+  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
+
+  Optional<ValueAndVReg> ConstSelect =
+    getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
+  if (ConstSelect) {
+    // The selector has to be an inline immediate, so we can use whatever for
+    // the other operands.
+    MIB.addReg(Val);
+    MIB.addImm(ConstSelect->Value &
+               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
+  } else {
+    Optional<ValueAndVReg> ConstVal =
+      getConstantVRegValWithLookThrough(Val, *MRI, true, true);
+
+    // If the value written is an inline immediate, we can get away without a
+    // copy to m0.
+    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value,
+                                                 STI.hasInv2PiInlineImm())) {
+      MIB.addImm(ConstVal->Value);
+      MIB.addReg(LaneSelect);
+    } else {
+      MIB.addReg(Val);
+
+      // If the lane selector was originally in a VGPR and copied with
+      // readfirstlane, there's a hazard to read the same SGPR from the
+      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
+      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
+
+      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+        .addReg(LaneSelect);
+      MIB.addReg(AMDGPU::M0);
+    }
+  }
+
+  MIB.addReg(VDstIn);
+
+  MI.eraseFromParent();
+  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
 // We need to handle this here because tablegen doesn't support matching
 // instructions with multiple outputs.
 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
@@ -885,6 +942,8 @@
     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
   case Intrinsic::amdgcn_wwm:
     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
+  case Intrinsic::amdgcn_writelane:
+    return selectWritelane(I);
   case Intrinsic::amdgcn_div_scale:
     return selectDivScale(I);
   case Intrinsic::amdgcn_icmp:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll
@@ -0,0 +1,332 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+
+define amdgpu_ps float @test_writelane_s_s_s(i32 inreg %data, i32 inreg %lane, i32 inreg %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_s_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_mov_b32 m0, s3
+; GFX7-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_s_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_mov_b32 m0, s3
+; GFX8-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_s_s_s:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX10-NEXT:    ; return to shader part epilog
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %data, i32 %lane, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+define amdgpu_ps float @test_writelane_s_s_imm(i32 inreg %data, i32 inreg %lane) #0 {
+; GFX7-LABEL: test_writelane_s_s_imm:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v0, 42
+; GFX7-NEXT:    s_mov_b32 m0, s3
+; GFX7-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_s_s_imm:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, 42
+; GFX8-NEXT:    s_mov_b32 m0, s3
+; GFX8-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_s_s_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX10-NEXT:    ; return to shader part epilog
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %data, i32 %lane, i32 42)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+; Data (999) is not an inline immediate, so it is materialized in an SGPR.
+define amdgpu_ps float @test_writelane_k_s_v(i32 inreg %lane, i32 %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_k_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_movk_i32 s0, 0x3e7
+; GFX7-NEXT:    s_mov_b32 m0, s2
+; GFX7-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_k_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s0, 0x3e7
+; GFX8-NEXT:    s_mov_b32 m0, s2
+; GFX8-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_k_s_v:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s0, 0x3e7
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v0, s0, s2
+; GFX10-NEXT:    ; return to shader part epilog
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 999, i32 %lane, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+; Data (42) is an inline immediate.
+define amdgpu_ps float @test_writelane_imm_s_v(i32 inreg %lane, i32 %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_imm_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_writelane_b32 v0, 42, s2
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_imm_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_writelane_b32 v0, 42, s2
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_imm_s_v:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_writelane_b32 v0, 42, s2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    ; return to shader part epilog
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 42, i32 %lane, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+; Data (0.15915494) is an inline immediate on GFX8/GFX10 but not on GFX7.
+define amdgpu_ps float @test_writelane_imminv2pi_s_v(i32 inreg %lane, i32 %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_imminv2pi_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, 0x3e22f983
+; GFX7-NEXT:    s_mov_b32 m0, s2
+; GFX7-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_imminv2pi_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_writelane_b32 v0, 0.15915494, s2
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_imminv2pi_s_v:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_writelane_b32 v0, 0.15915494, s2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    ; return to shader part epilog
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 bitcast (float 0x3FC45F3060000000 to i32), i32 %lane, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+
+; Lane select (23) is an inline immediate.
+define amdgpu_ps float @test_writelane_s_imm_v(i32 inreg %data, i32 %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_s_imm_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_writelane_b32 v0, s2, 23
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_s_imm_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_writelane_b32 v0, s2, 23
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_s_imm_v:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_writelane_b32 v0, s2, 23
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    ; return to shader part epilog
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %data, i32 23, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+; Lane index (67) is larger than the wavesize; GFX7/GFX8 emit it masked to 3.
+define amdgpu_ps float @test_writelane_s_k0_v(i32 inreg %data, i32 %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_s_k0_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_writelane_b32 v0, s2, 3
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_s_k0_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_writelane_b32 v0, s2, 3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_s_k0_v:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s0, 0x43
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v0, s2, s0
+; GFX10-NEXT:    ; return to shader part epilog
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %data, i32 67, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+; Lane index (32) is larger than the wavesize for wave32.
+define amdgpu_ps float @test_writelane_s_k1_v(i32 inreg %data, i32 %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_s_k1_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_s_k1_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_s_k1_v:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    ; return to shader part epilog
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %data, i32 32, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+define amdgpu_ps float @test_writelane_v_v_v(i32 %data, i32 %lane, i32 %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_v_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    s_mov_b32 m0, s1
+; GFX7-NEXT:    v_writelane_b32 v2, s0, m0
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_v_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    s_mov_b32 m0, s1
+; GFX8-NEXT:    v_writelane_b32 v2, s0, m0
+; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_v_v_v:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v2, s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    ; return to shader part epilog
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %data, i32 %lane, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+define amdgpu_ps float @test_writelane_v_s_v(i32 %data, i32 inreg %lane, i32 %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_v_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    s_mov_b32 m0, s2
+; GFX7-NEXT:    v_writelane_b32 v1, s0, m0
+; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_v_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    s_mov_b32 m0, s2
+; GFX8-NEXT:    v_writelane_b32 v1, s0, m0
+; GFX8-NEXT:    v_mov_b32_e32 v0, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_v_s_v:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v1, s0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %data, i32 inreg %lane, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+
+define amdgpu_ps float @test_writelane_m0_s_v(i32 inreg %lane, i32 %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_m0_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_mov_b32 s0, m0
+; GFX7-NEXT:    s_mov_b32 m0, s2
+; GFX7-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_m0_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_mov_b32 s0, m0
+; GFX8-NEXT:    s_mov_b32 m0, s2
+; GFX8-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_m0_s_v:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    s_mov_b32 m0, -1
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v0, m0, s2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    ; return to shader part epilog
+  %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %m0, i32 %lane, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+define amdgpu_ps float @test_writelane_s_m0_v(i32 inreg %data, i32 %vdst.in) #0 {
+; GFX7-LABEL: test_writelane_s_m0_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: test_writelane_s_m0_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: test_writelane_s_m0_v:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    s_mov_b32 m0, -1
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    ; return to shader part epilog
+  %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %data, i32 %m0, i32 %vdst.in)
+  %writelane.cast = bitcast i32 %writelane to float
+  ret float %writelane.cast
+}
+
+declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind readnone willreturn }
+attributes #2 = { nounwind readnone speculatable willreturn }