Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
===================================================================
--- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -815,6 +815,43 @@
   return !hasVariantSymbol(Inst);
 }
 
+/// Return the number of prefixes which can be added to an instruction without
+/// encountering decoder delays for the given target. Note that the result is
+/// inclusive of escape bytes and mandatory prefixes.
+static unsigned maximumPrefixPadding(const MCSubtargetInfo &STI) {
+  // Summary of decoder behavior (per Agner's guide):
+  // AMD
+  //   K8 and K10 - three prefixes per instruction per cycle (i.e. no stall)
+  //   Bulldozer, Piledriver, Excavator, Steamroller - three prefixes per
+  //     instruction or 10-15 cycle stall
+  //   Ryzen - any number of prefixes in a single cycle
+  //   Bobcat, Jaguar - no penalty for multiple prefixes
+  // Intel
+  //   SandyBridge and later - can decode any number of prefixes in a single
+  //     cycle.
+  //   Atom - up to three prefixes can be decoded in a single cycle. (Severe
+  //     delay for more)
+  //   Silvermont - can decode three prefixes *and escape sequences* in a
+  //     single cycle (severe stall if exceeded)
+  //   Goldmont - can decode (most combinations of) four prefixes in a single
+  //     cycle.
+  //   Knights Landing - can decode three prefixes and escape bytes in a
+  //     single cycle (5-6 cycle stall if exceeded)
+
+  // We use maximum nop size as a proxy for the number of prefixes we can
+  // safely add. Since nops longer than 8 bytes are formed by adding redundant
+  // prefixes, this seems like a reasonable proxy.
+  if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
+    // TODO: This should probably be '3' once we validate that escape sequences
+    // are conservatively handled.
+    return 0;
+  else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+    return 15;
+  else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+    return 3;
+  return 2;
+}
+
 static unsigned getRemainingPrefixSize(const MCInst &Inst,
                                        const MCSubtargetInfo &STI,
                                        MCCodeEmitter &Emitter) {
@@ -823,13 +860,10 @@
   Emitter.emitPrefix(Inst, VecOS, STI);
   assert(Code.size() < 15 && "The number of prefixes must be less than 15.");
 
-  // TODO: It turns out we need a decent amount of plumbing for the target
-  // specific bits to determine number of prefixes its safe to add.  Various
-  // targets (older chips mostly, but also Atom family) encounter decoder
-  // stalls with too many prefixes.  For testing purposes, we set the value
-  // externally for the moment.
-  unsigned ExistingPrefixSize = Code.size();
-  unsigned TargetPrefixMax = X86PadMaxPrefixSize;
+  const unsigned ExistingPrefixSize = Code.size();
+  const unsigned TargetPrefixMax =
+      std::min(maximumPrefixPadding(STI), (unsigned)X86PadMaxPrefixSize);
+
   if (TargetPrefixMax <= ExistingPrefixSize)
     return 0;
   return TargetPrefixMax - ExistingPrefixSize;
Index: llvm/test/MC/X86/prefix-padding-32.s
===================================================================
--- llvm/test/MC/X86/prefix-padding-32.s
+++ llvm/test/MC/X86/prefix-padding-32.s
@@ -2,22 +2,22 @@
 # Check prefix padding generation for all cases on 32 bit x86.
 
-# CHECK: 1: 3e 3e 3e 3e 3e 3e 3e 3e 3e 81 e1 01 00 00 00 andl $1, %ecx
-# CHECK: 10: 3e 3e 3e 3e 3e 3e 3e 3e 3e 81 21 01 00 00 00 andl $1, %ds:(%ecx)
-# CHECK: 1f: 2e 2e 2e 2e 2e 2e 2e 2e 2e 81 21 01 00 00 00 andl $1, %cs:(%ecx)
-# CHECK: 2e: 3e 3e 3e 3e 3e 3e 3e 3e 3e 81 21 01 00 00 00 andl $1, %ds:(%ecx)
-# CHECK: 3d: 26 26 26 26 26 26 26 26 26 81 21 01 00 00 00 andl $1, %es:(%ecx)
-# CHECK: 4c: 64 64 64 64 64 64 64 64 64 81 21 01 00 00 00 andl $1, %fs:(%ecx)
-# CHECK: 5b: 65 65 65 65 65 65 65 65 65 81 21 01 00 00 00 andl $1, %gs:(%ecx)
-# CHECK: 6a: 36 36 36 36 36 36 36 36 36 81 21 01 00 00 00 andl $1, %ss:(%ecx)
-# CHECK: 79: 3e 3e 3e 3e 3e 81 a1 00 00 00 00 01 00 00 00 andl $1, %ds:(%ecx)
-# CHECK: 88: 3e 3e 3e 3e 3e 81 a1 00 00 00 00 01 00 00 00 andl $1, %ds:(%ecx)
-# CHECK: 97: 36 36 36 36 36 36 36 36 81 24 24 01 00 00 00 andl $1, %ss:(%esp)
-# CHECK: a6: 65 65 65 65 65 65 65 65 81 24 24 01 00 00 00 andl $1, %gs:(%esp)
-# CHECK: b5: 36 36 36 36 81 a4 24 00 00 00 00 01 00 00 00 andl $1, %ss:(%esp)
-# CHECK: c4: 36 36 36 36 36 36 36 36 81 65 00 01 00 00 00 andl $1, %ss:(%ebp)
-# CHECK: d3: 65 65 65 65 65 65 65 65 81 65 00 01 00 00 00 andl $1, %gs:(%ebp)
-# CHECK: e2: 36 36 36 36 36 81 a5 00 00 00 00 01 00 00 00 andl $1, %ss:(%ebp)
+# CHECK: 1: 3e 3e 81 e1 01 00 00 00 andl $1, %ecx
+# CHECK: 9: 3e 3e 81 21 01 00 00 00 andl $1, %ds:(%ecx)
+# CHECK: 11: 2e 2e 81 21 01 00 00 00 andl $1, %cs:(%ecx)
+# CHECK: 19: 3e 3e 81 21 01 00 00 00 andl $1, %ds:(%ecx)
+# CHECK: 21: 26 26 81 21 01 00 00 00 andl $1, %es:(%ecx)
+# CHECK: 29: 64 64 81 21 01 00 00 00 andl $1, %fs:(%ecx)
+# CHECK: 31: 65 65 81 21 01 00 00 00 andl $1, %gs:(%ecx)
+# CHECK: 39: 36 36 81 21 01 00 00 00 andl $1, %ss:(%ecx)
+# CHECK: 41: 3e 3e 81 a1 00 00 00 00 01 00 00 00 andl $1, %ds:(%ecx)
+# CHECK: 4d: 3e 3e 81 a1 00 00 00 00 01 00 00 00 andl $1, %ds:(%ecx)
+# CHECK: 59: 36 36 81 24 24 01 00 00 00 andl $1, %ss:(%esp)
+# CHECK: 62: 65 65 81 24 24 01 00 00 00 andl $1, %gs:(%esp)
+# CHECK: 6b: 36 36 81 a4 24 00 00 00 00 01 00 00 00 andl $1, %ss:(%esp)
+# CHECK: 78: 36 36 81 65 00 01 00 00 00 andl $1, %ss:(%ebp)
+# CHECK: 81: 65 65 81 65 00 01 00 00 00 andl $1, %gs:(%ebp)
+# CHECK: 8a: 36 36 81 a5 00 00 00 00 01 00 00 00 andl $1, %ss:(%ebp)
 
   .text
   .section .text
   .p2align 8