diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -156,6 +156,7 @@
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
+  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
 
   void printInfo();
@@ -352,7 +353,7 @@
         // inactive lanes.
         markInstructionUses(MI, StateWWM, Worklist);
         GlobalFlags |= StateWWM;
-        LowerToCopyInstrs.push_back(&MI);
+        LowerToMovInstrs.push_back(&MI);
         continue;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
@@ -852,7 +853,7 @@
 }
 
 void SIWholeQuadMode::lowerCopyInstrs() {
-  for (MachineInstr *MI : LowerToCopyInstrs) {
+  for (MachineInstr *MI : LowerToMovInstrs) {
     for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
       MI->RemoveOperand(i);
 
@@ -872,6 +873,12 @@
       MI->setDesc(TII->get(AMDGPU::COPY));
     }
   }
+  for (MachineInstr *MI : LowerToCopyInstrs) {
+    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
+      MI->RemoveOperand(i);
+
+    MI->setDesc(TII->get(AMDGPU::COPY));
+  }
 }
 
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
@@ -879,6 +886,7 @@
   Blocks.clear();
   LiveMaskQueries.clear();
   LowerToCopyInstrs.clear();
+  LowerToMovInstrs.clear();
   CallingConv = MF.getFunction().getCallingConv();
 
   ST = &MF.getSubtarget<GCNSubtarget>();
@@ -893,7 +901,7 @@
   unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   if (!(GlobalFlags & StateWQM)) {
     lowerLiveMaskQueries(Exec);
-    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
+    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
       return !LiveMaskQueries.empty();
   } else {
     // Store a copy of the original live mask when required
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -117,6 +117,9 @@
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
+; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
+; does not happen - the v_add should write the return reg directly.
+;CHECK-NOT: v_mov_b32_e32
 define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
 main_body:
   %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)