Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp =================================================================== --- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -815,6 +815,43 @@ return !hasVariantSymbol(Inst); } +/// Return the number of prefixes which can be added to an instruction without +/// encountering decoder delays for the given target. Note that the result is +/// inclusive of escape bytes and mandatory prefixes. +static unsigned maximumPrefixPadding(const MCSubtargetInfo &STI) { + // Summary of decoder behavior (per Agner's guide) + // AMD + // K8 and K10 - three prefixes per instruction per cycle (i.e. no stall) + // Bulldozer, Piledriver, Excavator, Steamroller - three prefixes per + // instruction or 10-15 cycle stall + // Ryzen - any number of prefixes in a single cycle + // Bobcat, Jaguar - no penalty for multiple prefixes + // Intel + // SandyBridge and later - can decode any number of prefixes in a single + // cycle. + // Atom - Up to three prefixes can be decoded in a single cycle. (Severe + // delay for more) + // Silvermont - can decode three prefixes *and escape sequences* in a + // single cycle (severe stall) + // Goldmont - can decode (most combinations of) four prefixes in a single + // cycle. + // Knights Landing - can decode three prefixes and escape bytes in a single + // cycle (5-6 cycle stall if exceeded) + + // We use maximum nop size as a proxy for the number of prefixes we can + // safely execute. Since nops longer than 8 are formed by adding redundant + // prefixes, this seems like a reasonable proxy. + if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP]) + // TODO: This should probably be '3' once we validate that escape sequences + // are conservatively handled. 
+ return 0; + else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) + return 15; + else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP]) + return 3; + return 2; +} + static unsigned getRemainingPrefixSize(const MCInst &Inst, const MCSubtargetInfo &STI, MCCodeEmitter &Emitter) { @@ -823,13 +860,11 @@ Emitter.emitPrefix(Inst, VecOS, STI); assert(Code.size() < 15 && "The number of prefixes must be less than 15."); - // TODO: It turns out we need a decent amount of plumbing for the target - // specific bits to determine number of prefixes its safe to add. Various - // targets (older chips mostly, but also Atom family) encounter decoder - // stalls with too many prefixes. For testing purposes, we set the value - // externally for the moment. unsigned ExistingPrefixSize = Code.size(); - unsigned TargetPrefixMax = X86PadMaxPrefixSize; + unsigned TargetPrefixMax = maximumPrefixPadding(STI); + if (X86PadMaxPrefixSize.getNumOccurrences()) + TargetPrefixMax = X86PadMaxPrefixSize; + if (TargetPrefixMax <= ExistingPrefixSize) return 0; return TargetPrefixMax - ExistingPrefixSize; Index: llvm/test/MC/X86/align-branch-64-hardcode.s =================================================================== --- llvm/test/MC/X86/align-branch-64-hardcode.s +++ llvm/test/MC/X86/align-branch-64-hardcode.s @@ -7,7 +7,7 @@ # CHECK: 1d: int3 # CHECK: 1e: jmp - # CHECK: 24: int3 + # CHECK: 26: int3 .p2align 5 .rept 30 int3 Index: llvm/test/MC/X86/align-branch-64-prefix.s =================================================================== --- llvm/test/MC/X86/align-branch-64-prefix.s +++ llvm/test/MC/X86/align-branch-64-prefix.s @@ -6,8 +6,8 @@ .text # CHECK: 1d: int3 - # CHECK: 1e: jmp - # CHECK: 24: int3 + # CHECK-NEXT: 1e: jmp + # CHECK-NEXT: 26: int3 .p2align 5 .rept 30 int3 @@ -17,8 +17,8 @@ int3 # CHECK: 5d: int3 - # CHECK: 5e: jmp - # CHECK: 64: int3 + # CHECK-NEXT: 5e: jmp + # CHECK-NEXT: 66: int3 .p2align 5 .rept 30 int3 @@ -28,8 +28,8 @@ int3 # CHECK: 9d: int3 - # CHECK: 9e: call 
- # CHECK: a6: int3 + # CHECK-NEXT: 9e: call + # CHECK-NEXT: a6: int3 .p2align 5 .rept 30 int3 @@ -39,8 +39,8 @@ int3 # CHECK: de: lock - # CHECK: df: jmp - # CHECK: e4: int3 + # CHECK-NEXT: df: jmp + # CHECK-NEXT: e6: int3 .p2align 5 .rept 30 int3 @@ -49,9 +49,11 @@ jmp baz int3 + # Note: This one is possibly a decoder bug, gnu shows "rex.w, cs cs jmp 0x8" # CHECK: 11d: int3 - # CHECK: 11e: jmp - # CHECK: 124: int3 + # CHECK-NEXT: 11e: cs + # CHECK-NEXT: 120: jmp + # CHECK-NEXT: 126: int3 .p2align 5 .rept 30 int3 @@ -61,8 +63,8 @@ int3 # CHECK: 15d: int3 - # CHECK: 15e: {{.*}} jmp - # CHECK: 164: int3 + # CHECK-NEXT: 15e: {{.*}} jmp + # CHECK-NEXT: 164: int3 .p2align 5 .rept 30 int3