Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -90,6 +90,11 @@ switch (MI.getOpcode()) { default: return false; + case AMDGPU::V_ADDC_U32_e64: + case AMDGPU::V_SUBB_U32_e64: + // Additional verification is needed for sdst/src2. + return true; + case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: if (!isVGPR(Src2, TRI, MRI) || @@ -174,7 +179,7 @@ const MachineOperand &Orig) { for (MachineOperand &Use : MI.implicit_operands()) { - if (Use.getReg() == AMDGPU::VCC) { + if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { Use.setIsUndef(Orig.isUndef()); Use.setIsKill(Orig.isKill()); return; @@ -459,11 +464,26 @@ // Check for the bool flag output for instructions like V_ADD_I32_e64. const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - if (SDst && SDst->getReg() != AMDGPU::VCC) { - if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) - MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); - continue; + // Check the carry-in operand for v_addc_u32_e64. + const MachineOperand *Src2 = TII->getNamedOperand(MI, + AMDGPU::OpName::src2); + + if (SDst) { + if (SDst->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) + MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); + continue; + } + + // All of the instructions with carry outs also have an SGPR input in + // src2. 
+ if (Src2 && Src2->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) + MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC); + + continue; + } } // We can shrink this instruction @@ -491,8 +511,6 @@ if (Src1) Inst32.addOperand(*Src1); - const MachineOperand *Src2 = - TII->getNamedOperand(MI, AMDGPU::OpName::src2); if (Src2) { int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); if (Op32Src2Idx != -1) { Index: test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir =================================================================== --- test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir +++ test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir @@ -46,6 +46,45 @@ ret void } + define void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = add i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + define void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = add i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + define void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 
@llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = add i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } @@ -303,3 +342,256 @@ S_ENDPGM ... +--- +# GCN-LABEL: name: check_addc_src2_vop3{{$}} +# GCN: %29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec +# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec +name: check_addc_src2_vop3 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32 } + - { id: 23, class: sreg_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + 
isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %9 = S_MOV_B64 0 + %29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +... 
+---
+# GCN-LABEL: name: shrink_addc_vop3{{$}}
+# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+
+name: shrink_addc_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sgpr_64 }
+  - { id: 13, class: sgpr_128 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_64 }
+  - { id: 16, class: sgpr_128 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vreg_64 }
+  - { id: 21, class: sreg_32_xm0 }
+  - { id: 22, class: sreg_32 }
+  - { id: 23, class: sreg_32 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vreg_64 }
+  - { id: 26, class: vgpr_32 }
+  - { id: 27, class: vreg_64 }
+  - { id: 28, class: vreg_64 }
+  - { id: 29, class: vgpr_32 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 
:: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %vcc = S_MOV_B64 0 + %29, %vcc = V_ADDC_U32_e64 %19, %17, %vcc, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +... + +--- +# GCN-LABEL: name: shrink_addc_undef_vcc{{$}} +# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit undef %vcc, implicit %exec +# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec +name: shrink_addc_undef_vcc +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, 
class: sreg_32 } + - { id: 23, class: sreg_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %29, %vcc = V_ADDC_U32_e64 %19, %17, undef %vcc, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +...