diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -156,6 +156,7 @@ DenseMap Instructions; DenseMap Blocks; SmallVector LiveMaskQueries; + SmallVector LowerToMovInstrs; SmallVector LowerToCopyInstrs; void printInfo(); @@ -352,7 +353,7 @@ // inactive lanes. markInstructionUses(MI, StateWWM, Worklist); GlobalFlags |= StateWWM; - LowerToCopyInstrs.push_back(&MI); + LowerToMovInstrs.push_back(&MI); continue; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { @@ -852,9 +853,8 @@ } void SIWholeQuadMode::lowerCopyInstrs() { - for (MachineInstr *MI : LowerToCopyInstrs) { - for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--) - MI->RemoveOperand(i); + for (MachineInstr *MI : LowerToMovInstrs) { + assert(MI->getNumExplicitOperands() == 2); const Register Reg = MI->getOperand(0).getReg(); @@ -872,6 +872,22 @@ MI->setDesc(TII->get(AMDGPU::COPY)); } } + for (MachineInstr *MI : LowerToCopyInstrs) { + if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || + MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { + assert(MI->getNumExplicitOperands() == 3); + // the only reason we should be here is V_SET_INACTIVE has + // an undef input so it is being replaced by a simple copy. + // There should be a second undef source that we should remove. + assert(MI->getOperand(2).isUndef()); + MI->RemoveOperand(2); + MI->untieRegOperand(1); + } else { + assert(MI->getNumExplicitOperands() == 2); + } + + MI->setDesc(TII->get(AMDGPU::COPY)); + } } bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { @@ -879,6 +895,7 @@ Blocks.clear(); LiveMaskQueries.clear(); LowerToCopyInstrs.clear(); + LowerToMovInstrs.clear(); CallingConv = MF.getFunction().getCallingConv(); ST = &MF.getSubtarget(); @@ -893,7 +910,7 @@ unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; if (!(GlobalFlags & StateWQM)) { lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty()) + if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty()) return !LiveMaskQueries.empty(); } else { // Store a copy of the original live mask when required diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -117,6 +117,9 @@ ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 +; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this +; does not happen - the v_add should write the return reg directly. +;CHECK-NOT: v_mov_b32_e32 define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) { main_body: %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -48,3 +48,62 @@ SI_RETURN_TO_EPILOG $vgpr0 ... + +--- +# V_SET_INACTIVE, when its second operand is undef, is replaced by a +# COPY by si-wqm. Ensure the instruction is removed. 
+#CHECK-NOT: V_SET_INACTIVE +name: no_cfg +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: + - { id: 0, class: sgpr_32, preferred-register: '' } + - { id: 1, class: sgpr_32, preferred-register: '' } + - { id: 2, class: sgpr_32, preferred-register: '' } + - { id: 3, class: sgpr_32, preferred-register: '' } + - { id: 4, class: sgpr_32, preferred-register: '' } + - { id: 5, class: sgpr_128, preferred-register: '' } + - { id: 6, class: sgpr_128, preferred-register: '' } + - { id: 7, class: sreg_32, preferred-register: '' } + - { id: 8, class: vreg_64, preferred-register: '' } + - { id: 9, class: sreg_32, preferred-register: '' } + - { id: 10, class: vgpr_32, preferred-register: '' } + - { id: 11, class: vgpr_32, preferred-register: '' } + - { id: 12, class: sreg_32, preferred-register: '' } + - { id: 13, class: vgpr_32, preferred-register: '' } + - { id: 14, class: vgpr_32, preferred-register: '' } + - { id: 15, class: vgpr_32, preferred-register: '' } + - { id: 16, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$sgpr0', virtual-reg: '%0' } + - { reg: '$sgpr1', virtual-reg: '%1' } + - { reg: '$sgpr2', virtual-reg: '%2' } + - { reg: '$sgpr3', virtual-reg: '%3' } +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3 + + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 + %5:sgpr_128 = COPY %6 + %7:sreg_32 = S_MOV_B32 0 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, 0, 0, 0, implicit $exec + %16:vgpr_32 = COPY %8.sub1 + %11:vgpr_32 = COPY %16 + %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec + %14:vgpr_32 = COPY %7 + %13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec + early-clobber %15:vgpr_32 = WWM killed %13, 
implicit $exec + BUFFER_STORE_DWORD_OFFSET_exact killed %15, %6, %7, 4, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... diff --git a/llvm/utils/chunk-print-before-all.py b/llvm/utils/chunk-print-before-all.py --- a/llvm/utils/chunk-print-before-all.py +++ b/llvm/utils/chunk-print-before-all.py @@ -5,38 +5,52 @@ # "crashinfo.txt" file leaving only the valid input IR in the last chunk. # Files are written to current working directory. +from __future__ import print_function import sys +import re + basename = "chunk-" chunk_id = 0 -def print_chunk(lines): +def print_chunk(lines, name): global chunk_id global basename - fname = basename + str(chunk_id) + ".ll" + fixed_name = name.replace(" ", "_") + fixed_name = re.sub("[^\w]", "", fixed_name) + fname = basename + str(chunk_id) + "_" + fixed_name + ".ll" chunk_id = chunk_id + 1 - print "writing chunk " + fname + " (" + str(len(lines)) + " lines)" + print("writing chunk " + fname + " (" + str(len(lines)) + " lines)") with open(fname, "w") as f: f.writelines(lines) is_dump = False cur = [] -for line in sys.stdin: - if line.startswith("*** IR Dump Before ") and len(cur) != 0: - print_chunk(cur); +name = "unnamed" +if len(sys.argv) == 1: + dfile = sys.stdin +else: + dfile = open(sys.argv[1], "r") + +for line in dfile: + match = re.match("[#;]* *\*\*\* IR Dump (After|Before) (.*) \*\*\*", line) + if match != None and len(cur) != 0: + print_chunk(cur, name) cur = [] cur.append("; " + line) elif line.startswith("Stack dump:"): - print_chunk(cur); + print_chunk(cur, name) cur = [] cur.append(line) is_dump = True else: cur.append(line) + if match != None: + name = match.group(2) if is_dump: - print "writing crashinfo.txt (" + str(len(cur)) + " lines)" + print("writing crashinfo.txt (" + str(len(cur)) + " lines)") with open("crashinfo.txt", "w") as f: f.writelines(cur) else: - print_chunk(cur); + print_chunk(cur, name)