Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -91,6 +91,7 @@
   void matchSDWAOperands(MachineBasicBlock &MBB);
   std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
   bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
+  bool opConvertedToVOP2(const MachineInstr &MI, const GCNSubtarget &ST) const;
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
@@ -854,6 +855,105 @@
   }
 }
 
+// Determine if we can convert V_ADDC_U32_e64 into V_ADDC_U32_e32,
+// and return true after converting the instruction into its VOP2 (e32)
+// form. This allows isConvertibleToSDWA to perform its transformation on
+// V_ADD_I32_e64 and morph it into V_ADD_I32_sdwa.
+//
+// We are transforming from a VOP3 into a VOP2 form of the instruction.
+//   %19:vgpr_32 = V_AND_B32_e32 255,
+//     killed %16:vgpr_32, implicit $exec
+//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
+//     %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
+//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
+//     %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
+//
+// becomes
+//   %47:vgpr_32 = V_ADD_I32_sdwa
+//     0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
+//     implicit-def $vcc, implicit $exec
+//   %48:vgpr_32 = V_ADDC_U32_e32
+//     0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
+//
+// Returns false, leaving the code untouched, whenever the rewrite cannot be
+// proven safe. Both resulting instructions define VCC and the carry travels
+// through VCC between them, so the two instructions must share a basic block,
+// VCC must be dead before MI, and nothing in between may write VCC.
+//
+// NOTE(review): this mutates the function from a const query. If MI is not
+// converted to SDWA afterwards, the e32 successor reads a VCC value that MI
+// never wrote. Confirm that every caller converts MI once this returns true.
+
+bool SIPeepholeSDWA::opConvertedToVOP2(const MachineInstr &MI,
+                                       const GCNSubtarget &ST) const {
+  // Instr has an e32 equivalent ?
+  if (AMDGPU::getVOPe32(MI.getOpcode()) == -1)
+    return false;
+  // Find the instruction consuming the carry-out of MI.
+  LLVM_DEBUG(dbgs() << "MI: " << MI << '\n');
+  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+  if (!Sdst)
+    return false;
+  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
+  if (!NextOp)
+    return false;
+  MachineInstr &MISucc = *NextOp->getParent();
+  LLVM_DEBUG(dbgs() << "MISucc: " << MISucc << '\n');
+  // VCC is a physical register; only hand the carry over within one block.
+  if (MISucc.getParent() != MI.getParent())
+    return false;
+  // Successor instruction has a legal vop2 form?
+  if (!TII->canShrink(MISucc, *MRI))
+    return false;
+  const int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
+  if (SuccOpc == -1)
+    return false;
+  const MCInstrDesc &E32Desc = TII->get(SuccOpc);
+  MachineOperand *Vdst = TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst);
+  if (!Vdst)
+    return false;
+  MachineOperand *Src0 = TII->getNamedOperand(MISucc, AMDGPU::OpName::src0);
+  if (!Src0)
+    return false;
+  MachineOperand *Src1 = TII->getNamedOperand(MISucc, AMDGPU::OpName::src1);
+  if (!Src1)
+    return false;
+  // The carry-in of MISucc must be the carry-out of MI (also rejects null).
+  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
+  if (CarryIn != NextOp)
+    return false;
+  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
+  if (!CarryOut)
+    return false;
+  // Make sure the carry in/out are subsequently unused.
+  if (!CarryIn->isKill())
+    return false;
+  if (!CarryOut->isDead())
+    return false;
+  // Make sure VCC is dead before MI: both new instructions clobber it.
+  const MachineBasicBlock &MBB = *MI.getParent();
+  if (MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25) !=
+      MachineBasicBlock::LQR_Dead)
+    return false;
+  // Make sure nothing between MI and MISucc overwrites the carry in VCC.
+  const MachineInstr *Between = MI.getNextNode();
+  for (; Between && Between != &MISucc; Between = Between->getNextNode())
+    if (Between->modifiesRegister(AMDGPU::VCC, TRI))
+      return false;
+  // MISucc was not found after MI; do not touch anything.
+  if (!Between)
+    return false;
+  // Make the new instruction.
+  auto NewInst =
+      BuildMI(*MISucc.getParent(), MISucc, MISucc.getDebugLoc(), E32Desc);
+  NewInst.add(*Vdst);
+  NewInst.add(*Src0);
+  NewInst.add(*Src1);
+  LLVM_DEBUG(dbgs() << "NewInst: " << *NewInst << '\n');
+  MISucc.eraseFromParent();
+  return true;
+}
+
 bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
                                          const GCNSubtarget &ST) const {
   // Check if this is already an SDWA instruction
@@ -883,6 +983,10 @@
          TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
       return false;
 
+  } else if (MI.getOpcode() == AMDGPU::V_ADD_I32_e64 ||
+             MI.getOpcode() == AMDGPU::V_SUB_I32_e64) {
+    if (!opConvertedToVOP2(MI, ST))
+      return false;
   } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
              !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
     return false;
Index: test/CodeGen/AMDGPU/sdwa-op64-test.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sdwa-op64-test.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GCN %s
+
+; GCN-LABEL: {{^}}test_add_co_sdwa:
+; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = and i32 %tmp4, 255
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
+  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp9 = add nsw i64 %tmp8, %tmp6
+  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}test_sub_co_sdwa:
+; GFX9: v_sub_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = and i32 %tmp4, 255
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
+  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp9 = sub nsw i64 %tmp8, %tmp6
+  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()