Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -126,7 +126,7 @@ bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; std::pair<Register, unsigned> - selectVOP3ModsImpl(Register Src) const; + selectVOP3ModsImpl(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2135,8 +2135,9 @@ } std::pair<Register, unsigned> -AMDGPUInstructionSelector::selectVOP3ModsImpl( - Register Src) const { +AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { + Register Src = Root.getReg(); + Register OrigSrc = Src; unsigned Mods = 0; MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); @@ -2151,6 +2152,20 @@ Mods |= SISrcMods::ABS; } + if (Mods != 0 && + RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { + MachineInstr *UseMI = Root.getParent(); + + // If we looked through copies to find source modifiers on an SGPR operand, + // we now have an SGPR register source. To avoid potentially violating the + // constant bus restriction, we need to insert a copy to a VGPR. 
+ Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); + BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), + TII.get(AMDGPU::COPY), VGPRSrc) + .addReg(Src); + Src = VGPRSrc; + } + return std::make_pair(Src, Mods); } @@ -2168,7 +2183,7 @@ AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -2191,7 +2206,7 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -2215,7 +2230,7 @@ AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root); if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) return None; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll @@ -228,6 +228,73 @@ ret float %result } +define amdgpu_ps float @select_vcc_s_s(float %cmp0, float %cmp1, float inreg %src0, float inreg %src1) { + ; GFX9-LABEL: name: select_vcc_s_s + ; GFX9: bb.1 (%ir-block.0): + ; GFX9: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX9: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX9: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]] + ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = 
COPY [[COPY2]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32) + ; GFX9: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]] + ; GFX9: $vgpr0 = COPY [[SELECT]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX10-LABEL: name: select_vcc_s_s + ; GFX10: bb.1 (%ir-block.0): + ; GFX10: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX10: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX10: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]] + ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32) + ; GFX10: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]] + ; GFX10: $vgpr0 = COPY [[SELECT]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 + %cmp = fcmp oeq float %cmp0, %cmp1 + %result = select i1 %cmp, float %src0, float %src1 + ret float %result +} + +define amdgpu_ps float @select_vcc_fneg_s_s(float %cmp0, float %cmp1, float inreg %src0, float inreg %src1) { + ; GFX9-LABEL: name: select_vcc_fneg_s_s + ; GFX9: bb.1 (%ir-block.0): + ; GFX9: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX9: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX9: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]] + ; GFX9: [[FNEG:%[0-9]+]]:sgpr(s32) = G_FNEG [[COPY2]] + ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[FNEG]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32) + ; GFX9: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]] + ; GFX9: $vgpr0 = COPY [[SELECT]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX10-LABEL: name: select_vcc_fneg_s_s + ; 
GFX10: bb.1 (%ir-block.0): + ; GFX10: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX10: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX10: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]] + ; GFX10: [[FNEG:%[0-9]+]]:sgpr(s32) = G_FNEG [[COPY2]] + ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[FNEG]](s32) + ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32) + ; GFX10: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]] + ; GFX10: $vgpr0 = COPY [[SELECT]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 + %cmp = fcmp oeq float %cmp0, %cmp1 + %neg.src0 = fneg float %src0 + %result = select i1 %cmp, float %neg.src0, float %src1 + ret float %result +} + ; Constant bus used by vcc define amdgpu_ps float @amdgcn_div_fmas_sss(float inreg %src, float %cmp.src) { ; GFX9-LABEL: name: amdgcn_div_fmas_sss Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s32.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s32.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s32.mir @@ -165,7 +165,8 @@ ; GFX6-LABEL: name: fadd_s32_fneg_copy_sgpr ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $exec + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 1, [[COPY2]], 0, 0, implicit $exec ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 @@ -202,3 +203,90 @@ S_ENDPGM 0, implicit %6 ... + +# The source modifier lookup searches through SGPR->VGPR copies. 
Make +# sure we don't violate the constant bus restriction when we look at +# the source. + +--- + +name: fadd_s32_copy_fabs_sgpr_copy_fabs_sgpr +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; GFX6-LABEL: name: fadd_s32_copy_fabs_sgpr_copy_fabs_sgpr + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 2, [[COPY2]], 2, [[COPY3]], 0, 0, implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = G_FABS %0 + %3:sgpr(s32) = G_FABS %1 + %4:vgpr(s32) = COPY %2 + %5:vgpr(s32) = COPY %3 + %6:vgpr(s32) = G_FADD %4, %5 + S_ENDPGM 0, implicit %6 + +... + +--- + +name: fadd_s32_copy_fneg_sgpr_copy_fneg_sgpr +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; GFX6-LABEL: name: fadd_s32_copy_fneg_sgpr_copy_fneg_sgpr + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 1, [[COPY2]], 1, [[COPY3]], 0, 0, implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = G_FNEG %0 + %3:sgpr(s32) = G_FNEG %1 + %4:vgpr(s32) = COPY %2 + %5:vgpr(s32) = COPY %3 + %6:vgpr(s32) = G_FADD %4, %5 + S_ENDPGM 0, implicit %6 + +... 
+ +--- + +name: fadd_s32_copy_fneg_fabs_sgpr_copy_fneg_fabs_sgpr +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; GFX6-LABEL: name: fadd_s32_copy_fneg_fabs_sgpr_copy_fneg_fabs_sgpr + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 3, [[COPY2]], 3, [[COPY3]], 0, 0, implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = G_FABS %0 + %3:sgpr(s32) = G_FABS %1 + %4:sgpr(s32) = G_FNEG %2 + %5:sgpr(s32) = G_FNEG %3 + %6:vgpr(s32) = COPY %4 + %7:vgpr(s32) = COPY %5 + %8:vgpr(s32) = G_FADD %6, %7 + S_ENDPGM 0, implicit %8 + +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s64.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s64.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s64.mir @@ -166,7 +166,8 @@ ; GFX6-LABEL: name: fadd_s64_fneg_copy_sgpr ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX6: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $exec + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY1]] + ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 1, [[COPY2]], 0, 0, implicit $exec ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]] %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:sgpr(s64) = COPY $sgpr0_sgpr1 @@ -176,3 +177,90 @@ S_ENDPGM 0, implicit %4 ... + +# The source modifier lookup searches through SGPR->VGPR copies. Make +# sure we don't violate the constant bus restriction when we look at +# the source. 
+ +--- + +name: fadd_s64_copy_fabs_sgpr_copy_fabs_sgpr +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; GFX6-LABEL: name: fadd_s64_copy_fabs_sgpr_copy_fabs_sgpr + ; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX6: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY1]] + ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 2, [[COPY2]], 2, [[COPY3]], 0, 0, implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = COPY $sgpr2_sgpr3 + %2:sgpr(s64) = G_FABS %0 + %3:sgpr(s64) = G_FABS %1 + %4:vgpr(s64) = COPY %2 + %5:vgpr(s64) = COPY %3 + %6:vgpr(s64) = G_FADD %4, %5 + S_ENDPGM 0, implicit %6 + +... + +--- + +name: fadd_s64_copy_fneg_sgpr_copy_fneg_sgpr +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; GFX6-LABEL: name: fadd_s64_copy_fneg_sgpr_copy_fneg_sgpr + ; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX6: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY1]] + ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 1, [[COPY2]], 1, [[COPY3]], 0, 0, implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = COPY $sgpr2_sgpr3 + %2:sgpr(s64) = G_FNEG %0 + %3:sgpr(s64) = G_FNEG %1 + %4:vgpr(s64) = COPY %2 + %5:vgpr(s64) = COPY %3 + %6:vgpr(s64) = G_FADD %4, %5 + S_ENDPGM 0, implicit %6 + +... 
+ +--- + +name: fadd_s64_copy_fneg_fabs_sgpr_copy_fneg_fabs_sgpr +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; GFX6-LABEL: name: fadd_s64_copy_fneg_fabs_sgpr_copy_fneg_fabs_sgpr + ; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX6: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY1]] + ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 3, [[COPY2]], 3, [[COPY3]], 0, 0, implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = COPY $sgpr2_sgpr3 + %2:sgpr(s64) = G_FABS %0 + %3:sgpr(s64) = G_FABS %1 + %4:sgpr(s64) = G_FNEG %2 + %5:sgpr(s64) = G_FNEG %3 + %6:vgpr(s64) = COPY %4 + %7:vgpr(s64) = COPY %5 + %8:vgpr(s64) = G_FADD %6, %7 + S_ENDPGM 0, implicit %8 + +...