Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -84,7 +84,7 @@ // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add // a special case for it. It can only be shrunk if the third operand // is vcc. We should handle this the same way we handle vopc, by addding - // a register allocation hint pre-regalloc and then do the shrining + // a register allocation hint pre-regalloc and then do the shrinking // post-regalloc. if (Src2) { switch (MI.getOpcode()) { @@ -456,6 +456,16 @@ continue; } + // Check for the bool flag output for instructions like V_ADD_I32_e64. + const MachineOperand *SDst = TII->getNamedOperand(MI, + AMDGPU::OpName::sdst); + if (SDst && SDst->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) + MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); + + continue; + } + // We can shrink this instruction DEBUG(dbgs() << "Shrinking " << MI); Index: test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir @@ -0,0 +1,305 @@ +# RUN: llc -verify-machineinstrs -march=amdgcn -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s +# Check that add with carry out isn't incorrectly reduced to e32 when +# the carry out is a virtual register. 
+ +# TODO: We should run this test until the end of codegen to make sure +# that the post-RA run does manage to shrink it, but right now +# resuming codegen after this pass crashes. + +--- | + define void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = add i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + define void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = sub i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + define void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = sub i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + declare i32 @llvm.amdgcn.workitem.id.x() #1 + + attributes #0 = { nounwind } + attributes #1 = { nounwind 
readnone } + +... +--- +# GCN-LABEL: name: shrink_add_vop3{{$}} +# GCN: %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec +# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec +name: shrink_add_vop3 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32 } + - { id: 23, class: sreg_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal 
dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +... +--- +# GCN-LABEL: name: shrink_sub_vop3{{$}} +# GCN: %29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec +# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec + +name: shrink_sub_vop3 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32 } + - { id: 23, class: sreg_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: 
vreg_64 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +... 
+--- +# GCN-LABEL: name: shrink_subrev_vop3{{$}} +# GCN: %29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec +# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec + +name: shrink_subrev_vop3 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32 } + - { id: 23, class: sreg_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal 
dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +...