Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp
===================================================================
--- lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -249,6 +249,35 @@
           }
         }
       }
+      // Combine adjacent s_nops to use the immediate operand encoding how long
+      // to wait.
+      //
+      // s_nop N
+      // s_nop M
+      //  =>
+      // s_nop (N + M)
+      //
+      // Next may be MBB.end() when MI is the last instruction in the block, so
+      // it must be checked before it is dereferenced.
+      if (MI.getOpcode() == AMDGPU::S_NOP &&
+          Next != MBB.end() &&
+          Next->getOpcode() == AMDGPU::S_NOP) {
+        MachineInstr &NextMI = *Next;
+
+        // The instruction encodes the amount to wait with an offset of 1,
+        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
+        // after adding.
+        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
+        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
+
+        // Make sure we don't overflow the bounds.
+        if (Nop0 + Nop1 <= 8) {
+          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
+          MI.eraseFromParent();
+        }
+
+        continue;
+      }
 
       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
         continue;