Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2626,6 +2626,10 @@
   if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
     return false;
 
+  // Can it be shrunk to a valid 32 bit opcode?
+  if (!hasVALU32BitEncoding(MI.getOpcode()))
+    return false;
+
   // Check output modifiers
   return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
          !hasModifiersSet(MI, AMDGPU::OpName::clamp);
Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -90,7 +90,10 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
   void matchSDWAOperands(MachineBasicBlock &MBB);
   std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
-  bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
+  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
+  void pseudoOpConvertToVOP2(MachineInstr &MI,
+                             const GCNSubtarget &ST) const;
+  bool VCCUsable(MachineInstr &MI, MachineInstr &MISucc) const;
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
@@ -854,7 +857,146 @@
   }
 }
 
-bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
+// Return true when VCC can be clobbered by rewriting MI and MISucc into their
+// e32 forms, which pass the carry through VCC.
+//
+// The rewritten MI defines VCC and the rewritten MISucc reads and redefines
+// it, so VCC must not be accessed between the two instructions and any value
+// it held before MI must not be read afterwards. Returns false whenever that
+// cannot be established, including when the scan budget runs out.
+bool SIPeepholeSDWA::VCCUsable(MachineInstr &MI, MachineInstr &MISucc) const {
+  // If there is no reference to VCC in the current function, it is usable.
+  if (MRI->reg_empty(AMDGPU::VCC))
+    return true;
+
+  // Only a pair that lives in a single basic block can be analysed here.
+  MachineBasicBlock *MBB = MI.getParent();
+  if (MISucc.getParent() != MBB)
+    return false;
+
+  // Bound the number of instructions inspected to keep the check cheap.
+  unsigned Budget = 100;
+  bool SeenSucc = false;
+  for (MachineBasicBlock::const_instr_iterator I = std::next(MI.getIterator()),
+                                               E = MBB->instr_end();
+       I != E; ++I) {
+    const MachineInstr &Instr = *I;
+    if (&Instr == &MISucc) {
+      SeenSucc = true;
+      continue;
+    }
+    if (Instr.isDebugInstr())
+      continue;
+    if (!--Budget)
+      return false;
+
+    bool ReadsVCC = Instr.readsRegister(AMDGPU::VCC, TRI);
+    bool WritesVCC = Instr.modifiesRegister(AMDGPU::VCC, TRI);
+    if (!SeenSucc) {
+      // A write here would clobber the carry produced by MI; a read here
+      // expects the value that MI is about to overwrite.
+      if (ReadsVCC || WritesVCC)
+        return false;
+      continue;
+    }
+    // Past MISucc: a read needs the value VCC held before the pair, while a
+    // pure redefinition proves that value was already dead.
+    if (ReadsVCC)
+      return false;
+    if (WritesVCC)
+      return true;
+  }
+  // MISucc does not follow MI in this block.
+  if (!SeenSucc)
+    return false;
+
+  // VCC was not redefined before the end of the block, so it is only free if
+  // no successor expects it (or an aliasing register) as a live-in.
+  for (const MachineBasicBlock *Succ : MBB->successors())
+    for (MCRegAliasIterator AI(AMDGPU::VCC, TRI, true); AI.isValid(); ++AI)
+      if (Succ->isLiveIn(*AI))
+        return false;
+  return true;
+}
+
+// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
+// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
+// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa.
+//
+// We are transforming from a VOP3 into a VOP2 form of the instruction.
+//   %19:vgpr_32 = V_AND_B32_e32 255,
+//       killed %16:vgpr_32, implicit $exec
+//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
+//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
+//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
+//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
+//
+// becomes
+//   %47:vgpr_32 = V_ADD_I32_sdwa
+//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
+//       implicit-def $vcc, implicit $exec
+//   %48:vgpr_32 = V_ADDC_U32_e32
+//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
+void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
+                                           const GCNSubtarget &ST) const {
+  int Opc = MI.getOpcode();
+  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
+         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
+
+  // Can the candidate MI be shrunk?
+  if (!TII->canShrink(MI, *MRI))
+    return;
+  Opc = AMDGPU::getVOPe32(Opc);
+  // Find the single instruction that consumes the carry-out of MI.
+  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+  if (!Sdst)
+    return;
+  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
+  if (!NextOp)
+    return;
+  MachineInstr &MISucc = *NextOp->getParent();
+  // Can the successor be shrunk?
+  if (!TII->canShrink(MISucc, *MRI))
+    return;
+  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
+  // Make sure the carry in/out are subsequently unused.
+  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
+  if (!CarryIn)
+    return;
+  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
+  if (!CarryOut)
+    return;
+  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
+    return;
+  // The e32 forms carry through VCC, so it must be free around the pair.
+  if (!VCCUsable(MI, MISucc))
+    return;
+  // Give up if MI or MISucc has any source modifier operands.
+  if (TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
+    return;
+  if (TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
+    return;
+  if (TII->getNamedOperand(MISucc, AMDGPU::OpName::src0_modifiers))
+    return;
+  if (TII->getNamedOperand(MISucc, AMDGPU::OpName::src1_modifiers))
+    return;
+  // Make the two new e32 instruction variants.
+  // Replace MI with V_{SUB|ADD}_I32_e32
+  auto NewMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
+  MI.eraseFromParent();
+  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
+  auto NewInst = BuildMI(*MISucc.getParent(), MISucc, MISucc.getDebugLoc(),
+                         TII->get(SuccOpc));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
+  MISucc.eraseFromParent();
+}
+
+bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                          const GCNSubtarget &ST) const {
   // Check if this is already an SDWA instruction
   unsigned Opc = MI.getOpcode();
@@ -1127,6 +1269,22 @@
   for (MachineBasicBlock &MBB : MF) {
     bool Changed = false;
     do {
+      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
+      // Look for a possible ADD or SUB that resulted from a previously lowered
+      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
+      // lowers the pair of instructions into e32 form.
+      matchSDWAOperands(MBB);
+      for (const auto &OperandPair : SDWAOperands) {
+        const auto &Operand = OperandPair.second;
+        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+        if (PotentialMI &&
+            (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
+             PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
+          pseudoOpConvertToVOP2(*PotentialMI, ST);
+      }
+      SDWAOperands.clear();
+
+      // Generate potential match list.
matchSDWAOperands(MBB); for (const auto &OperandPair : SDWAOperands) { Index: test/CodeGen/AMDGPU/sdwa-op64-test.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/sdwa-op64-test.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=FIJI,GCN %s + +; GCN-LABEL: {{^}}test_add_co_sdwa: +; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp + %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 + %tmp5 = and i32 %tmp4, 255 + %tmp6 = zext i32 %tmp5 to i64 + %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp + %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8 + %tmp9 = add nsw i64 %tmp8, %tmp6 + store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8 + ret void +} + + +; GCN-LABEL: {{^}}test_sub_co_sdwa: +; GFX9: v_sub_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +; FIJI: v_sub_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FIJI: v_subbrev_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* 
%arg, i32 addrspace(1)* %arg1) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp + %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 + %tmp5 = and i32 %tmp4, 255 + %tmp6 = zext i32 %tmp5 to i64 + %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp + %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8 + %tmp9 = sub nsw i64 %tmp8, %tmp6 + store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8 + ret void +} + +; GCN-LABEL: {{^}}test1_add_co_sdwa: +; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +define amdgpu_kernel void @test1_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1, i64 addrspace(1)* %arg2) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp + %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 + %tmp5 = and i32 %tmp4, 255 + %tmp6 = zext i32 %tmp5 to i64 + %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp + %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8 + %tmp9 = add nsw i64 %tmp8, %tmp6 + store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8 + %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* 
%arg1, i32 %tmp + %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 + %tmp15 = and i32 %tmp14, 255 + %tmp16 = zext i32 %tmp15 to i64 + %tmp17 = getelementptr inbounds i64, i64 addrspace(1)* %arg2, i32 %tmp + %tmp18 = load i64, i64 addrspace(1)* %tmp17, align 8 + %tmp19 = add nsw i64 %tmp18, %tmp16 + store i64 %tmp19, i64 addrspace(1)* %tmp17, align 8 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/sdwa-ops.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/sdwa-ops.mir @@ -0,0 +1,237 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s + +# test for 3 consecutive _sdwa's +# GFX9-LABEL: name: test1_add_co_sdwa +# GFX9: V_ADD_I32_sdwa +# GFX9-NEXT: V_ADDC_U32_e32 +# GFX9: V_ADD_I32_sdwa +# GFX9-NEXT: V_ADDC_U32_e32 +# GFX9: V_ADD_I32_sdwa +# GFX9-NEXT: V_ADDC_U32_e32 +--- +name: test1_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %64:vgpr_32, dead %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %163:vgpr_32, %165:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %161, implicit $exec + %164:vgpr_32, dead 
%166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, implicit $exec + %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %162, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + %171:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %173:vgpr_32, %175:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %171, implicit $exec + %174:vgpr_32, dead %176:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %175, implicit $exec + %172:vreg_64 = REG_SEQUENCE %173, %subreg.sub0, %174, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %172, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + +... + +# test for VCC interference on sdwa, should generate 1 xform only +# GFX9-LABEL: name: test2_add_co_sdwa +# GFX9: V_ADD_I32_sdwa +# GFX9: V_ADDC_U32_e32 +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9-NOT: V_ADDC_U32_e32 +--- +name: test2_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + + %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %163:vgpr_32, %165:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %161, implicit $exec + %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, implicit $exec + %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1 + + %64:vgpr_32, dead %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + %161:vgpr_32 = V_AND_B32_e32 
%22, %0, implicit $exec + %163:vgpr_32, %165:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %161, implicit $exec + %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, implicit $exec + %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %162, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + +... + +# test for CarryOut used, should reject +# GFX9-LABEL: name: test3_add_co_sdwa +# GFX9: V_ADD_I32_e64 +# GFX9: V_ADDC_U32_e64 +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9-NOT: V_ADDC_U32_e32 +--- +name: test3_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %66, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + +... 
+ +# test for CarryIn used more than once, should reject +# GFX9-LABEL: name: test4_add_co_sdwa +# GFX9: V_ADD_I32_e64 +# GFX9: V_ADDC_U32_e64 +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9-NOT: V_ADDC_U32_e32 +--- +name: test4_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %65, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + +... + +# test for simple example, should generate sdwa +# GFX9-LABEL: name: test5_add_co_sdwa +# GFX9: V_ADD_I32_sdwa +# GFX9: V_ADDC_U32_e32 +--- +name: test5_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + +... 
+ +# test for V_ADD_I32_e64 only, should reject +# GFX9-LABEL: name: test6_add_co_sdwa +# GFX9: V_ADD_I32_e64 +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9-NOT: V_ADDC_U32_e32 +--- +name: test6_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %23, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + +... + +# test for V_ADDC_U32_e64 only, should reject +# GFX9-LABEL: name: test7_add_co_sdwa +# GFX9: V_ADDC_U32_e64 +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9-NOT: V_ADDC_U32_e32 +--- +name: test7_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %24:sreg_64_xexec = COPY $sgpr0_sgpr1 + + %30:vreg_64 = COPY $sgpr0_sgpr1 + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %24, implicit $exec + %62:vreg_64 = REG_SEQUENCE %23, %subreg.sub0, %23, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + +