Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2611,6 +2611,10 @@
   if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
     return false;
 
+  // Can it be shrunk to a valid 32 bit opcode?
+  if (!hasVALU32BitEncoding(MI.getOpcode()))
+    return false;
+
   // Check output modifiers
   return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
          !hasModifiersSet(MI, AMDGPU::OpName::clamp);
Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -90,7 +90,9 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
   void matchSDWAOperands(MachineBasicBlock &MBB);
   std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
-  bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
+  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
+  void pseudoOpConvertToVOP2(MachineInstr &MI,
+                             const GCNSubtarget &ST) const;
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
@@ -854,7 +856,76 @@
   }
 }
 
-bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
+// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
+// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
+// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa.
+//
+// We are transforming from a VOP3 into a VOP2 form of the instruction.
+//   %19:vgpr_32 = V_AND_B32_e32 255,
+//       killed %16:vgpr_32, implicit $exec
+//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
+//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
+//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
+//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
+//
+// becomes
+//   %47:vgpr_32 = V_ADD_I32_sdwa
+//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
+//       implicit-def $vcc, implicit $exec
+//   %48:vgpr_32 = V_ADDC_U32_e32
+//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
+void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
+                                           const GCNSubtarget &ST) const {
+  int Opc = MI.getOpcode();
+  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
+         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
+
+  // Can the candidate MI be shrunk?
+  if (!TII->canShrink(MI, *MRI))
+    return;
+  Opc = AMDGPU::getVOPe32(Opc);
+  // Find the related ADD instruction.
+  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+  if (!Sdst)
+    return;
+  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
+  if (!NextOp)
+    return;
+  MachineInstr &MISucc = *NextOp->getParent();
+  // Can the successor be shrunk?
+  if (!TII->canShrink(MISucc, *MRI))
+    return;
+  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
+  // Make sure the carry in/out are subsequently unused.
+  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
+  if (!CarryIn)
+    return;
+  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
+  if (!CarryOut)
+    return;
+  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
+    return;
+  // CarryIn has one use only; make sure its def is V_{SUB|ADD}_I32_e64 (MI).
+  MachineInstr *MICI = MRI->getUniqueVRegDef(CarryIn->getReg());
+  if (MICI != &MI)
+    return;
+  // Make the two new e32 instruction variants.
+  // Replace MI with V_{SUB|ADD}_I32_e32
+  auto NewMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
+  MI.eraseFromParent();
+  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
+  auto NewInst = BuildMI(*MISucc.getParent(), MISucc, MISucc.getDebugLoc(),
+                         TII->get(SuccOpc));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
+  MISucc.eraseFromParent();
+}
+
+bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                          const GCNSubtarget &ST) const {
   // Check if this is already an SDWA instruction
   unsigned Opc = MI.getOpcode();
@@ -882,7 +953,6 @@
         (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
          TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
       return false;
-
   } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
              !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
     return false;
@@ -1127,6 +1197,22 @@
   for (MachineBasicBlock &MBB : MF) {
     bool Changed = false;
     do {
+      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
+      // Look for a possible ADD or SUB that resulted from a previously lowered
+      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
+      // lowers the pair of instructions into e32 form.
+      matchSDWAOperands(MBB);
+      for (const auto &OperandPair : SDWAOperands) {
+        const auto &Operand = OperandPair.second;
+        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+        if (PotentialMI &&
+            (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
+             PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
+          pseudoOpConvertToVOP2(*PotentialMI, ST);
+      }
+      SDWAOperands.clear();
+
+      // Generate potential match list.
       matchSDWAOperands(MBB);
 
       for (const auto &OperandPair : SDWAOperands) {
Index: test/CodeGen/AMDGPU/sdwa-op64-test.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sdwa-op64-test.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=FIJI,GCN %s
+
+; GCN-LABEL: {{^}}test_add_co_sdwa:
+; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = and i32 %tmp4, 255
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
+  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp9 = add nsw i64 %tmp8, %tmp6
+  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}test_sub_co_sdwa:
+; GFX9: v_sub_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+; FIJI: v_sub_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; FIJI: v_subbrev_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = and i32 %tmp4, 255
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
+  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp9 = sub nsw i64 %tmp8, %tmp6
+  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
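
Note: pseudoOpConvertToVOP2 above relies on findSingleRegUse, an existing static helper in SIPeepholeSDWA.cpp that these hunks call but do not show. As a rough illustration only (the in-tree helper's exact signature and subregister handling may differ), a minimal sketch of such a "single use of a virtual register" query could look like this:

// Illustrative sketch only -- not the in-tree findSingleRegUse. Returns the
// sole non-debug use operand of the virtual register defined by Def, or
// nullptr if the register is unused or is read by more than one instruction.
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

static llvm::MachineOperand *
findSingleRegUseSketch(const llvm::MachineOperand *Def,
                       const llvm::MachineRegisterInfo *MRI) {
  if (!Def->isReg() || !Def->isDef())
    return nullptr;

  llvm::MachineOperand *OnlyUse = nullptr;
  for (llvm::MachineOperand &Use : MRI->use_nodbg_operands(Def->getReg())) {
    // Bail out if a second using instruction shows up.
    if (OnlyUse && OnlyUse->getParent() != Use.getParent())
      return nullptr;
    OnlyUse = &Use;
  }
  return OnlyUse;
}

This mirrors how the patch uses the helper: the carry-out (sdst) of V_ADD_I32_e64 must feed exactly one instruction, the dependent V_ADDC_U32_e64, before the pair is shrunk to its e32 form.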