Index: lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp =================================================================== --- lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -344,10 +344,16 @@ return true; } - uint64_t MaxNopLength = STI.getFeatureBits()[X86::ProcIntelSLM] ? 7 : 15; + // 15 bytes is the longest single nop instruction, but 10 bytes (2 prefixes) + // is commonly the longest that can be efficiently decoded. + uint64_t MaxNopLength = 10; + if (STI.getFeatureBits()[X86::ProcIntelSLM]) + MaxNopLength = 7; + else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) + MaxNopLength = 15; - // 15 is the longest single nop instruction. Emit as many 15-byte nops as - // needed, then emit a nop of the remaining length. + // Emit as many MaxNopLength nops as needed, then emit a nop of the remaining + // length. do { const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10; Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -305,8 +305,11 @@ : SubtargetFeature< "fast-lzcnt", "HasFastLZCNT", "true", "LZCNT instructions are as fast as most simple integer ops">; - - +// If the target can efficiently decode NOPs up to 15 bytes in length. +def FeatureFast15ByteNOP + : SubtargetFeature< + "fast-15bytenop", "HasFast15ByteNOP", "true", + "Target can quickly decode up to 15 byte NOPs">; // Sandy Bridge and newer processors can use SHLD with the same source on both // inputs to implement rotate to avoid the partial flag update of the normal // rotate instructions. 
@@ -874,6 +877,7 @@ FeatureXSAVEOPT, FeatureSlowSHLD, FeatureLAHFSAHF, + FeatureFast15ByteNOP, FeatureFastPartialYMMorZMMWrite ]>; @@ -1004,6 +1008,7 @@ FeatureLAHFSAHF, FeatureLZCNT, FeatureMacroFusion, + FeatureFast15ByteNOP, FeatureMMX, FeatureMOVBE, FeatureMWAITX, Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -246,6 +246,10 @@ /// of a YMM or ZMM register without clearing the upper part. bool HasFastPartialYMMorZMMWrite; + /// True if there is no performance penalty for writing NOPs with up to + /// 15 bytes. + bool HasFast15ByteNOP; + /// True if gather is reasonably fast. This is true for Skylake client and /// all AVX-512 CPUs. bool HasFastGather; Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -335,6 +335,7 @@ HasLZCNTFalseDeps = false; HasFastVariableShuffle = false; HasFastPartialYMMorZMMWrite = false; + HasFast15ByteNOP = false; HasFastGather = false; HasFastScalarFSQRT = false; HasFastVectorFSQRT = false; Index: test/MC/MachO/x86_32-optimal_nop.s =================================================================== --- test/MC/MachO/x86_32-optimal_nop.s +++ test/MC/MachO/x86_32-optimal_nop.s @@ -202,15 +202,15 @@ // CHECK: 0090: C3000000 00000000 00000000 00000000 |................| // CHECK: 00A0: C3C3C3C3 C3C3C366 0F1F8400 00000000 |.......f........| // CHECK: 00B0: C3000000 00000000 00000000 00000000 |................| -// CHECK: 00C0: C3C3C3C3 C366662E 0F1F8400 00000000 |.....ff.........| +// CHECK: 00C0: C3C3C3C3 C3662E0F 1F840000 00000090 |.....f..........| // CHECK: 00D0: C3000000 00000000 00000000 00000000 |................| -// CHECK: 00E0: C3C3C3C3 6666662E 0F1F8400 00000000 |....fff.........| +// CHECK: 00E0: C3C3C3C3 662E0F1F 84000000 00006690 |....f.........f.| // CHECK: 00F0: 
C3000000 00000000 00000000 00000000 |................| -// CHECK: 0100: C3C3C366 6666662E 0F1F8400 00000000 |...ffff.........| +// CHECK: 0100: C3C3C366 2E0F1F84 00000000 000F1F00 |...f............| // CHECK: 0110: C3000000 00000000 00000000 00000000 |................| -// CHECK: 0120: C3C36666 6666662E 0F1F8400 00000000 |..fffff.........| +// CHECK: 0120: C3C3662E 0F1F8400 00000000 0F1F4000 |..f...........@.| // CHECK: 0130: C3000000 00000000 00000000 00000000 |................| -// CHECK: 0140: C3666666 6666662E 0F1F8400 00000000 |.ffffff.........| +// CHECK: 0140: C3662E0F 1F840000 0000000F 1F440000 |.f...........D..| // CHECK: 0150: C3 |.| // CHECK: ) // CHECK: } @@ -255,7 +255,7 @@ // CHECK: } // CHECK: Segment { // CHECK: Cmd: LC_SEGMENT -// CHECK: Name: +// CHECK: Name: // CHECK: Size: 192 // CHECK: vmaddr: 0x0 // CHECK: vmsize: 0x174 Index: test/MC/X86/AlignedBundling/long-nop-pad.s =================================================================== --- test/MC/X86/AlignedBundling/long-nop-pad.s +++ test/MC/X86/AlignedBundling/long-nop-pad.s @@ -13,17 +13,19 @@ .bundle_lock align_to_end callq bar .bundle_unlock -# To align this group to a bundle end, we need a 15-byte NOP and a 12-byte NOP. +# To align this group to a bundle end, we need two 10-byte NOPs and a 7-byte NOP. # CHECK: 0: nop -# CHECK-NEXT: f: nop +# CHECK-NEXT: a: nop +# CHECK-NEXT: 14: nop # CHECK: 1b: callq # This push instruction is 1 byte long .bundle_lock align_to_end push %rax .bundle_unlock -# To align this group to a bundle end, we need two 15-byte NOPs, and a 1-byte. +# To align this group to a bundle end, we need three 10-byte NOPs, and a 1-byte. 
# CHECK: 20: nop -# CHECK-NEXT: 2f: nop +# CHECK-NEXT: 2a: nop +# CHECK-NEXT: 34: nop # CHECK-NEXT: 3e: nop # CHECK-NEXT: 3f: pushq Index: test/MC/X86/AlignedBundling/misaligned-bundle-group.s =================================================================== --- test/MC/X86/AlignedBundling/misaligned-bundle-group.s +++ test/MC/X86/AlignedBundling/misaligned-bundle-group.s @@ -13,9 +13,9 @@ .bundle_lock align_to_end # CHECK: 1: nopw %cs:(%eax,%eax) # CHECK: 10: nopw %cs:(%eax,%eax) -# CHECK-RELAX: 1f: nop +# CHECK-RELAX: 1a: nop # CHECK-RELAX: 20: nopw %cs:(%eax,%eax) -# CHECK-RELAX: 2f: nopw %cs:(%eax,%eax) +# CHECK-RELAX: 2a: nopw %cs:(%eax,%eax) # CHECK-OPT: 1b: calll -4 # CHECK-RELAX: 3b: calll -4 calll bar # 5 bytes Index: test/MC/X86/AlignedBundling/misaligned-bundle.s =================================================================== --- test/MC/X86/AlignedBundling/misaligned-bundle.s +++ test/MC/X86/AlignedBundling/misaligned-bundle.s @@ -12,7 +12,7 @@ .align 16 # CHECK: 1: nopw %cs:(%eax,%eax) # CHECK-RELAX: 10: nopw %cs:(%eax,%eax) -# CHECK-RELAX: 1f: nop +# CHECK-RELAX: 1a: nop # CHECK-OPT: 10: movl $1, (%esp) # CHECK-RELAX: 20: movl $1, (%esp) movl $0x1, (%esp) # 7 bytes Index: test/MC/X86/AlignedBundling/pad-bundle-groups.s =================================================================== --- test/MC/X86/AlignedBundling/pad-bundle-groups.s +++ test/MC/X86/AlignedBundling/pad-bundle-groups.s @@ -38,9 +38,10 @@ callq bar callq bar .bundle_unlock -# And here we'll need a 11-byte NOP +# And here we'll need a 10-byte NOP + 1-byte NOP # CHECK: 30: callq # CHECK: 35: nop +# CHECK: 3f: nop # CHECK-NEXT: 40: callq # CHECK-NEXT: 45: callq Index: test/MC/X86/x86_long_nop.s =================================================================== --- test/MC/X86/x86_long_nop.s +++ test/MC/X86/x86_long_nop.s @@ -1,30 +1,41 @@ -# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s -# 
RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=pentiumpro | llvm-objdump -d -no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10 +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=pentiumpro | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10 +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10 +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=slm %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=silvermont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=lakemont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=NOP1 %s +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=btver2 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=btver2 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver1 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s 
--check-prefix=LNOP15 +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver1 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 -# Ensure alignment directives also emit sequences of 15-byte NOPs on processors +# Ensure alignment directives also emit sequences of 10 and 15-byte NOPs on processors # capable of using long NOPs. inc %eax .p2align 5 inc %eax -# CHECK: 0: inc -# CHECK-NEXT: 1: nop -# CHECK-NEXT: 10: nop -# CHECK-NEXT: 1f: nop -# CHECK-NEXT: 20: inc +# LNOP15: 0: inc +# LNOP15-NEXT: 1: nop +# LNOP15-NEXT: 10: nop +# LNOP15-NEXT: 1f: nop +# LNOP15-NEXT: 20: inc + +# LNOP10: 0: inc +# LNOP10-NEXT: 1: nop +# LNOP10-NEXT: b: nop +# LNOP10-NEXT: 15: nop +# LNOP10-NEXT: 1f: nop +# LNOP10-NEXT: 20: inc # On Silvermont we emit only 7 byte NOPs since longer NOPs are not profitable. # LNOP7: 0: inc # LNOP7-NEXT: 1: nop # LNOP7-NEXT: 8: nop # LNOP7-NEXT: f: nop -# LNOP7-NEXT: 16: nop -# LNOP7-NEXT: 1d: nop -# LNOP7-NEXT: 20: inc +# LNOP7-NEXT: 16: nop +# LNOP7-NEXT: 1d: nop +# LNOP7-NEXT: 20: inc # On Lakemont we emit only 1 byte NOPs since longer NOPs are not supported/legal # NOP1: 0: inc