diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -264,15 +264,21 @@ const uint32_t Encoded_s_code_end = 0xbf9f0000; const uint32_t Encoded_s_nop = 0xbf800000; uint32_t Encoded_pad = Encoded_s_code_end; - unsigned FillSize = 48; + + // Instruction cache line size in bytes. + const unsigned Log2CacheLineSize = 6; + const unsigned CacheLineSize = 1u << Log2CacheLineSize; + + // Enough padding to support prefetch mode 3. + unsigned FillSize = 3 * CacheLineSize; if (AMDGPU::isGFX90A(STI)) { Encoded_pad = Encoded_s_nop; - FillSize = 256; + FillSize = 16 * CacheLineSize; } - OS << "\t.p2alignl 6, " << Encoded_pad << '\n'; - OS << "\t.fill " << FillSize << ", 4, " << Encoded_pad << '\n'; + OS << "\t.p2alignl " << Log2CacheLineSize << ", " << Encoded_pad << '\n'; + OS << "\t.fill " << (FillSize / 4) << ", 4, " << Encoded_pad << '\n'; return true; } @@ -641,17 +647,23 @@ const uint32_t Encoded_s_code_end = 0xbf9f0000; const uint32_t Encoded_s_nop = 0xbf800000; uint32_t Encoded_pad = Encoded_s_code_end; - unsigned FillSize = 48; + + // Instruction cache line size in bytes. + const unsigned Log2CacheLineSize = 6; + const unsigned CacheLineSize = 1u << Log2CacheLineSize; + + // Enough padding to support prefetch mode 3. + unsigned FillSize = 3 * CacheLineSize; if (AMDGPU::isGFX90A(STI)) { Encoded_pad = Encoded_s_nop; - FillSize = 256; + FillSize = 16 * CacheLineSize; } MCStreamer &OS = getStreamer(); OS.PushSection(); - OS.emitValueToAlignment(64, Encoded_pad, 4); - for (unsigned I = 0; I < FillSize; ++I) + OS.emitValueToAlignment(CacheLineSize, Encoded_pad, 4); + for (unsigned I = 0; I < FillSize; I += 4) OS.emitInt32(Encoded_pad); OS.PopSection(); return true;