diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h --- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h +++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h @@ -171,6 +171,29 @@ // It should help move much of the target specific code from llvm-objdump to // respective target disassemblers. + /// Suggest a distance to skip in a buffer of data to find the next + /// place to look for the start of an instruction. For example, if + /// all instructions have a fixed alignment, this might advance to + /// the next multiple of that alignment. + /// + /// If not overridden, the default is 1. + /// + /// \param Address - The address, in the memory space of region, of the + /// starting point (typically the first byte of something + /// that did not decode as a valid instruction at all). + /// \param Bytes - A reference to the actual bytes at Address. May be + /// needed in order to determine the width of an + /// unrecognized instruction (e.g. in Thumb this is a simple + /// consistent criterion that doesn't require knowing the + /// specific instruction). The caller can pass as much data + /// as they have available, and the function is required to + /// make a reasonable default choice if not enough data is + /// available to make a better one. + /// \return - A number of bytes to skip. Must always be greater than + /// zero. May be greater than the size of Bytes. + virtual uint64_t suggestBytesToSkip(ArrayRef Bytes, + uint64_t Address) const; + private: MCContext &Ctx; diff --git a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp --- a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp +++ b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp @@ -20,6 +20,11 @@ return None; } +uint64_t MCDisassembler::suggestBytesToSkip(ArrayRef Bytes, + uint64_t Address) const { + return 1; +} + bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value, uint64_t Address, bool IsBranch, uint64_t Offset, uint64_t OpSize, diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -30,6 +30,9 @@ MCDisassembler::DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, raw_ostream &CStream) const override; + + uint64_t suggestBytesToSkip(ArrayRef Bytes, + uint64_t Address) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -350,6 +350,14 @@ return MCDisassembler::Fail; } +uint64_t AArch64Disassembler::suggestBytesToSkip(ArrayRef Bytes, + uint64_t Address) const { + // AArch64 instructions are always 4 bytes wide, so there's no point + // in skipping any smaller number of bytes if an instruction can't + // be decoded. + return 4; +} + static MCSymbolizer * createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo, LLVMSymbolLookupCallback SymbolLookUp, diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -139,6 +139,9 @@ ArrayRef Bytes, uint64_t Address, raw_ostream &CStream) const override; + uint64_t suggestBytesToSkip(ArrayRef Bytes, + uint64_t Address) const override; + private: DecodeStatus getARMInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, @@ -739,6 +742,33 @@ } } +uint64_t ARMDisassembler::suggestBytesToSkip(ArrayRef Bytes, + uint64_t Address) const { + // In Arm state, instructions are always 4 bytes wide, so there's no + // point in skipping any smaller number of bytes if an instruction + // can't be decoded. + if (!STI.getFeatureBits()[ARM::ModeThumb]) + return 4; + + // In a Thumb instruction stream, a halfword is a standalone 2-byte + // instruction if and only if its value is less than 0xE800. + // Otherwise, it's the first halfword of a 4-byte instruction. + // + // So, if we can see the upcoming halfword, we can judge on that + // basis, and maybe skip a whole 4-byte instruction that we don't + // know how to decode, without accidentally trying to interpret its + // second half as something else. + // + // If we don't have the instruction data available, we just have to + // recommend skipping the minimum sensible distance, which is 2 + // bytes. + if (Bytes.size() < 2) + return 2; + + uint16_t Insn16 = (Bytes[1] << 8) | Bytes[0]; + return Insn16 < 0xE800 ? 2 : 4; +} + DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef Bytes, uint64_t Address, diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr-resync.test b/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr-resync.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr-resync.test @@ -0,0 +1,52 @@ +# RUN: yaml2obj %s | llvm-objdump -d --mcpu=cortex-a8 - | FileCheck %s + +# Test that unrecognized instructions are skipped in a way that makes +# sense for the Arm instruction set encoding. +# +# The first three instructions in this file are marked by the mapping +# symbols as in Arm state, with the one in the middle unknown, and we +# expect the disassembler to skip 4 bytes because that's the width of +# any Arm instruction. +# +# At address 0xc there's a mapping symbol that says we're now in Thumb +# mode, and in that mode we include both a 16-bit and a 32-bit unknown +# Thumb instruction, which the disassembler will identify by the simple +# encoding criterion that tells you the instruction length without +# having to recognize it specifically. +# +# Finally we end with a single byte, to ensure nothing gets confused +# when the Thumb instruction stream doesn't contain enough data to +# even do that check. + +# CHECK: 0: 64 00 a0 e3 mov r0, #100 +# CHECK-NEXT: 4: ff ff ff ff +# CHECK-NEXT: 8: 12 03 81 e0 add r0, r1, r2, lsl r3 + +# CHECK: c: 64 20 movs r0, #100 +# CHECK-NEXT: e: 0e b8 +# CHECK-NEXT: 10: 40 18 adds r0, r0, r1 +# CHECK-NEXT: 12: 4f f0 64 00 mov.w r0, #100 +# CHECK-NEXT: 16: ee ff cc dd +# CHECK-NEXT: 1a: 01 eb c2 00 add.w r0, r1, r2, lsl #3 +# CHECK-NEXT: 1e: 9a + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_ARM + Flags: [ EF_ARM_EABI_VER5 ] +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x4 + Content: 6400a0e3ffffffff120381e064200eb840184ff06400eeffccdd01ebc2009a +Symbols: + - Name: '$a' + Section: .text + - Name: '$t' + Section: .text + Value: 0x0c +... diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr.test b/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr.test --- a/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr.test +++ b/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr.test @@ -13,8 +13,8 @@ ## llvm-objdump --mattr=+ext1 # CHECK: 00000000 <.text>: -# CHECK-NEXT: 0: cb -# CHECK-NEXT: 1: f3 f7 8b be b.w 0xffff3d1b <{{.+}}> @ imm = #-49898 +# CHECK-NEXT: 0: cb f3 f7 8b +# CHECK-NEXT: 4: be --- !ELF FileHeader: diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -1022,10 +1022,12 @@ // Disassemble a real instruction and record function-local branch labels. MCInst Inst; uint64_t Size; - bool Disassembled = DisAsm->getInstruction( - Inst, Size, Bytes.slice(Index - SectionAddr), Index, nulls()); + ArrayRef ThisBytes = Bytes.slice(Index - SectionAddr); + bool Disassembled = + DisAsm->getInstruction(Inst, Size, ThisBytes, Index, nulls()); if (Size == 0) - Size = 1; + Size = std::min(ThisBytes.size(), + DisAsm->suggestBytesToSkip(ThisBytes, Index)); if (Disassembled && MIA) { uint64_t Target; @@ -1068,10 +1070,11 @@ for (size_t Index = 0; Index != Bytes.size();) { MCInst Inst; uint64_t Size; - DisAsm->getInstruction(Inst, Size, Bytes.slice(Index), SectionAddr + Index, - nulls()); + ArrayRef ThisBytes = Bytes.slice(Index - SectionAddr); + DisAsm->getInstruction(Inst, Size, ThisBytes, Index, nulls()); if (Size == 0) - Size = 1; + Size = std::min(ThisBytes.size(), + DisAsm->suggestBytesToSkip(ThisBytes, Index)); Index += Size; } ArrayRef LabelAddrsRef = SymbolizerPtr->getReferencedAddresses(); @@ -1538,11 +1541,13 @@ // Disassemble a real instruction or a data when disassemble all is // provided MCInst Inst; - bool Disassembled = - DisAsm->getInstruction(Inst, Size, Bytes.slice(Index), - SectionAddr + Index, CommentStream); + ArrayRef ThisBytes = Bytes.slice(Index); + uint64_t ThisAddr = SectionAddr + Index; + bool Disassembled = DisAsm->getInstruction(Inst, Size, ThisBytes, + ThisAddr, CommentStream); if (Size == 0) - Size = 1; + Size = std::min(ThisBytes.size(), + DisAsm->suggestBytesToSkip(ThisBytes, ThisAddr)); LVP.update({Index, Section.getIndex()}, {Index + Size, Section.getIndex()}, Index + Size != End); diff --git a/llvm/tools/sancov/sancov.cpp b/llvm/tools/sancov/sancov.cpp --- a/llvm/tools/sancov/sancov.cpp +++ b/llvm/tools/sancov/sancov.cpp @@ -783,10 +783,12 @@ for (uint64_t Index = 0, Size = 0; Index < Section.getSize(); Index += Size) { MCInst Inst; - if (!DisAsm->getInstruction(Inst, Size, Bytes.slice(Index), - SectionAddr + Index, nulls())) { + ArrayRef ThisBytes = Bytes.slice(Index); + uint64_t ThisAddr = SectionAddr + Index; + if (!DisAsm->getInstruction(Inst, Size, ThisBytes, ThisAddr, nulls())) { if (Size == 0) - Size = 1; + Size = std::min(ThisBytes.size(), + DisAsm->suggestBytesToSkip(ThisBytes, ThisAddr)); continue; } uint64_t Addr = Index + SectionAddr;