Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -91,6 +91,8 @@
   void matchSDWAOperands(MachineBasicBlock &MBB);
   std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
   bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
+  bool pseudoOpConvertedToVOP2(const MachineInstr &MI,
+                               const GCNSubtarget &ST) const;
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
@@ -854,6 +856,80 @@
   }
 }
 
+// Determine if we can convert V_ADDC_U32_e64 into V_ADDC_U32_e32,
+// and return true after converting the instruction into its SDWA-able
+// form. This allows isConvertibleToSDWA to perform its transformation on
+// V_ADD_I32_e64 and morph it into V_ADD_I32_sdwa.
+//
+// We are transforming from a VOP3 into a VOP2 form of the instruction.
+//   %19:vgpr_32 = V_AND_B32_e32 255,
+//       killed %16:vgpr_32, implicit $exec
+//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
+//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
+//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
+//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
+//
+// becomes
+//   %47:vgpr_32 = V_ADD_I32_sdwa
+//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
+//       implicit-def $vcc, implicit $exec
+//   %48:vgpr_32 = V_ADDC_U32_e32
+//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
+bool SIPeepholeSDWA::pseudoOpConvertedToVOP2(const MachineInstr &MI,
+                                             const GCNSubtarget &ST) const {
+  int Opc = MI.getOpcode();
+  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
+         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
+
+  // Does the instruction have an e32 equivalent?
+  Opc = AMDGPU::getVOPe32(Opc);
+  if (Opc == -1)
+    return false;
+  Opc = TII->pseudoToMCOpcode(Opc);
+  if (Opc == -1)
+    return false;
+  // Find the instruction that consumes the carry-out of MI.
+  LLVM_DEBUG(dbgs() << "MI: " << MI << '\n');
+  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+  if (!Sdst)
+    return false;
+  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
+  if (!NextOp)
+    return false;
+  MachineInstr &MISucc = *NextOp->getParent();
+  LLVM_DEBUG(dbgs() << "MISucc: " << MISucc << '\n');
+  // Does the successor instruction have a VOP2 form?
+  Opc = MISucc.getOpcode();
+  Opc = AMDGPU::getVOPe32(Opc);
+  if (Opc == -1)
+    return false;
+  Opc = TII->pseudoToMCOpcode(Opc);
+  if (Opc == -1)
+    return false;
+  const MCInstrDesc &VOP2Desc = TII->get(Opc);
+  MachineOperand *Vdst = TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst);
+  MachineOperand *Src0 = TII->getNamedOperand(MISucc, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(MISucc, AMDGPU::OpName::src1);
+  // Make sure the carry-in has no other uses and the carry-out is unused.
+  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
+  if (!CarryIn)
+    return false;
+  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
+  if (!CarryOut)
+    return false;
+  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
+    return false;
+  // Make the new instruction.
+  auto NewInst =
+      BuildMI(*MISucc.getParent(), MISucc, MISucc.getDebugLoc(), VOP2Desc);
+  NewInst.add(*Vdst);
+  NewInst.add(*Src0);
+  NewInst.add(*Src1);
+  LLVM_DEBUG(dbgs() << "NewInst: " << *NewInst << '\n');
+  MISucc.eraseFromParent();
+  return true;
+}
+
 bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
                                          const GCNSubtarget &ST) const {
   // Check if this is already an SDWA instruction
@@ -883,6 +959,14 @@
          TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
       return false;
 
+    // Look for a possible ADD or SUB that resulted from a previously lowered
+    // V_ADD_U64_PSEUDO or V_SUB_U64_PSEUDO. The function pseudoOpConvertedToVOP2
+    // further validates that we have a lowered pseudo and returns true if it was
+    // able to perform the conversion.
+  } else if (MI.getOpcode() == AMDGPU::V_ADD_I32_e64 ||
+             MI.getOpcode() == AMDGPU::V_SUB_I32_e64) {
+    if (!pseudoOpConvertedToVOP2(MI, ST))
+      return false;
   } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
              !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
     return false;
Index: test/CodeGen/AMDGPU/sdwa-op64-test.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sdwa-op64-test.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=FIJI,GCN %s
+
+; GCN-LABEL: {{^}}test_add_co_sdwa:
+; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
+; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
+define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = and i32 %tmp4, 255
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
+  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp9 = add nsw i64 %tmp8, %tmp6
+  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}test_sub_co_sdwa:
+; GFX9: v_sub_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
+; FIJI: v_sub_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; FIJI: v_subb_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = and i32 %tmp4, 255
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
+  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp9 = sub nsw i64 %tmp8, %tmp6
+  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
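
Reviewer note (not part of the patch): the sketch below is a minimal, self-contained C++ model of the legality conditions that pseudoOpConvertedToVOP2 checks before rewriting the carry chain. The names CarryChain and canConvertToVOP2 are illustrative only and do not exist in the LLVM tree; the real pass queries MachineRegisterInfo and the AMDGPU opcode tables rather than these plain fields.

// Illustrative model of the VOP3 -> VOP2 carry-chain legality check.
#include <cstdio>

struct CarryChain {
  bool AddHasE32Form;   // VOP3 add/sub shrinks to a VOP2 (e32) opcode
  bool AddcHasE32Form;  // dependent ADDC/SUBB shrinks to a VOP2 (e32) opcode
  int CarryOutUses;     // uses of the add's carry-out (sdst)
  int AddcCarryOutUses; // uses of the ADDC/SUBB's carry-out (sdst)
};

// The rewrite is only legal when both instructions have e32 forms, the add's
// carry-out feeds exactly one instruction (the ADDC/SUBB), and the successor's
// carry-out is dead, because the e32 forms implicitly use/define $vcc instead
// of a virtual carry register.
bool canConvertToVOP2(const CarryChain &C) {
  return C.AddHasE32Form && C.AddcHasE32Form && C.CarryOutUses == 1 &&
         C.AddcCarryOutUses == 0;
}

int main() {
  CarryChain Legal{true, true, 1, 0};
  CarryChain CarryReused{true, true, 2, 0}; // carry-out read by another user
  std::printf("legal: %d, carry reused: %d\n", canConvertToVOP2(Legal),
              canConvertToVOP2(CarryReused));
  return 0;
}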