[WebAssembly] Initial Disassembler.

sbc100 · sbc100 · commit 16c16827bc7e · 2018-05-10T22:16:44.000Z
This implements a new table-gen emitter to create tables for a wasm disassembler, and a disassembler to use them. Comes with 2 tests that exercise a few instructions manually. Is also able to disassemble large .wasm files with objdump reasonably. Not working so well, to be addressed in followups: - objdump appears to be passing an incorrect starting point. - since the disassembler works an instruction at a time, and it is disassembling stack instructions, it has no idea of pseudo register assignments. These registers are required for the instruction printing code that follows. For now, all such registers appear in the output as $0. Patch by Wouter van Oortmerssen Differential Revision: https://reviews.llvm.org/D45848 llvm-svn: 332052
diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_TARGET_DEFINITIONS WebAssembly.td)
 tablegen(LLVM WebAssemblyGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM WebAssemblyGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM WebAssemblyGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM WebAssemblyGenDisassemblerTables.inc -gen-disassembler)
 tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel)
 tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter)
diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -19,16 +19,23 @@
 #include "WebAssembly.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/LEB128.h"
 #include "llvm/Support/TargetRegistry.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "wasm-disassembler"
 
+using DecodeStatus = MCDisassembler::DecodeStatus;
+
+#include "WebAssemblyGenDisassemblerTables.inc"
+
 namespace {
 class WebAssemblyDisassembler final : public MCDisassembler {
   std::unique_ptr<const MCInstrInfo> MCII;
@@ -60,11 +67,120 @@ extern "C" void LLVMInitializeWebAssemblyDisassembler() {
                                          createWebAssemblyDisassembler);
 }
 
-MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
-    MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
-    raw_ostream &OS, raw_ostream &CS) const {
+/// Reads the byte at index \p Size of \p Bytes and advances \p Size past it.
+/// Returns the byte value, or -1 if \p Size is already at or beyond the end
+/// of \p Bytes (in which case \p Size is left unchanged).
+static int nextByte(ArrayRef<uint8_t> Bytes, uint64_t &Size) {
+  if (Size >= Bytes.size())
+    return -1;
+  auto V = Bytes[Size];
+  Size++;
+  return V;
+}
 
-  // TODO: Implement disassembly.
+/// Decodes one LEB128 value from \p Bytes starting at offset \p Size and
+/// appends it to \p MI as an immediate operand. \p Signed selects SLEB128
+/// decoding; otherwise the value is decoded as ULEB128 and cast to int64_t.
+/// On success \p Size is advanced by the decoded length and true is returned.
+/// If the decoder reports an error, false is returned and neither \p Size nor
+/// \p MI is modified.
+static bool parseLEBImmediate(MCInst &MI, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, bool Signed) {
+  // N is filled in by the decoder and later added to Size; Error stays null
+  // unless the decoder reports a problem.
+  unsigned N = 0;
+  const char *Error = nullptr;
+  // Both decoders are given the end of the buffer so they can bound the read.
+  auto Val = Signed ? decodeSLEB128(Bytes.data() + Size, &N,
+                                    Bytes.data() + Bytes.size(), &Error)
+                    : static_cast<int64_t>(
+                          decodeULEB128(Bytes.data() + Size, &N,
+                                        Bytes.data() + Bytes.size(), &Error));
+  if (Error)
+    return false;
+  Size += N;
+  MI.addOperand(MCOperand::createImm(Val));
+  return true;
+}
+
+/// Reads a little-endian floating-point value of type \p T from \p Bytes at
+/// offset \p Size and appends it to \p MI as an FP immediate (widened to
+/// double). Advances \p Size by sizeof(T) and returns true on success;
+/// returns false, leaving \p Size and \p MI untouched, if fewer than
+/// sizeof(T) bytes remain.
+template <typename T>
+bool parseFPImmediate(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes) {
+  if (Size + sizeof(T) > Bytes.size())
+    return false;
+  T Val;
+  // memcpy avoids an unaligned / type-punned load from the byte buffer.
+  memcpy(&Val, Bytes.data() + Size, sizeof(T));
+  // byte_swap takes its argument by value and returns the converted result,
+  // so the result must be stored back; discarding it would leave the raw
+  // little-endian bytes in Val on big-endian hosts.
+  Val = support::endian::byte_swap<T, support::endianness::little>(Val);
+  Size += sizeof(T);
+  MI.addOperand(MCOperand::createFPImm(static_cast<double>(Val)));
+  return true;
+}
 
-  return MCDisassembler::Fail;
+/// Decodes a single instruction from the start of \p Bytes into \p MI.
+/// \p Size is reset to 0 and then advanced over each byte consumed: the
+/// opcode byte, a second opcode byte when the first is a prefix, and any
+/// encoded operands. Returns Fail if the bytes run out, the prefix has no
+/// table, the opcode slot is unused, or an operand cannot be parsed;
+/// otherwise returns Success. \p CS is stored as the comment stream.
+MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
+    MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
+    raw_ostream & /*OS*/, raw_ostream &CS) const {
+  CommentStream = &CS;
+  Size = 0;
+  auto Opc = nextByte(Bytes, Size);
+  if (Opc < 0)
+    return MCDisassembler::Fail;
+  // Opc is a byte value here, used directly as the index into the table for
+  // unprefixed opcodes.
+  const auto *WasmInst = &InstructionTable0[Opc];
+  // If this is a prefix byte, indirect to another table.
+  if (WasmInst->ET == ET_Prefix) {
+    WasmInst = nullptr;
+    // Linear search, so far only 2 entries.
+    // The list ends at the first entry whose Table is null.
+    for (auto PT = PrefixTable; PT->Table; PT++) {
+      if (PT->Prefix == Opc) {
+        WasmInst = PT->Table;
+        break;
+      }
+    }
+    if (!WasmInst)
+      return MCDisassembler::Fail;
+    // The byte after the prefix selects the entry within the prefix's table.
+    Opc = nextByte(Bytes, Size);
+    if (Opc < 0)
+      return MCDisassembler::Fail;
+    WasmInst += Opc;
+  }
+  if (WasmInst->ET == ET_Unused)
+    return MCDisassembler::Fail;
+  // At this point we must have a valid instruction to decode.
+  assert(WasmInst->ET == ET_Instruction);
+  MI.setOpcode(WasmInst->Opcode);
+  // Parse any operands.
+  for (uint8_t OPI = 0; OPI < WasmInst->NumOperands; OPI++) {
+    switch (WasmInst->Operands[OPI]) {
+    // ULEB operands:
+    case WebAssembly::OPERAND_BASIC_BLOCK:
+    case WebAssembly::OPERAND_LOCAL:
+    case WebAssembly::OPERAND_GLOBAL:
+    case WebAssembly::OPERAND_FUNCTION32:
+    case WebAssembly::OPERAND_OFFSET32:
+    case WebAssembly::OPERAND_P2ALIGN:
+    case WebAssembly::OPERAND_TYPEINDEX:
+    case MCOI::OPERAND_IMMEDIATE: {
+      if (!parseLEBImmediate(MI, Size, Bytes, false))
+        return MCDisassembler::Fail;
+      break;
+    }
+    // SLEB operands:
+    case WebAssembly::OPERAND_I32IMM:
+    case WebAssembly::OPERAND_I64IMM:
+    case WebAssembly::OPERAND_SIGNATURE: {
+      if (!parseLEBImmediate(MI, Size, Bytes, true))
+        return MCDisassembler::Fail;
+      break;
+    }
+    // FP operands.
+    case WebAssembly::OPERAND_F32IMM: {
+      if (!parseFPImmediate<float>(MI, Size, Bytes))
+        return MCDisassembler::Fail;
+      break;
+    }
+    case WebAssembly::OPERAND_F64IMM: {
+      if (!parseFPImmediate<double>(MI, Size, Bytes))
+        return MCDisassembler::Fail;
+      break;
+    }
+    case MCOI::OPERAND_REGISTER: {
+      // These are NOT actually in the instruction stream, but MC is going to
+      // expect operands to be present for them!
+      // FIXME: can MC re-generate register assignments or do we have to
+      // do this? Since this function decodes a single instruction, we don't
+      // have the proper context for tracking an operand stack here.
+      MI.addOperand(MCOperand::createReg(0));
+      break;
+    }
+    default:
+      llvm_unreachable("Unknown operand type in WebAssemblyDisassembler");
+    }
+  }
+  return MCDisassembler::Success;
 }
diff --git a/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -46,7 +46,7 @@ void WebAssemblyInstPrinter::printRegName(raw_ostream &OS,
 
 void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
                                        StringRef Annot,
-                                       const MCSubtargetInfo & /*STI*/) {
+                                       const MCSubtargetInfo &STI) {
   // Print the instruction (this uses the AsmStrings from the .td files).
   printInstruction(MI, OS);
 
@@ -194,20 +194,16 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 }
 
-void
-WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI,
-                                                       unsigned OpNo,
-                                                       raw_ostream &O) {
+void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(
+    const MCInst *MI, unsigned OpNo, raw_ostream &O) {
   int64_t Imm = MI->getOperand(OpNo).getImm();
   if (Imm == WebAssembly::GetDefaultP2Align(MI->getOpcode()))
     return;
   O << ":p2align=" << Imm;
 }
 
-void
-WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
-                                                         unsigned OpNo,
-                                                         raw_ostream &O) {
+void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(
+    const MCInst *MI, unsigned OpNo, raw_ostream &O) {
   int64_t Imm = MI->getOperand(OpNo).getImm();
   switch (WebAssembly::ExprType(Imm)) {
   case WebAssembly::ExprType::Void: break;
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -82,7 +82,15 @@ def WebAssemblyAsmParser : AsmParser {
   let ShouldEmitMatchRegisterName = 0;
 }
 
+def WebAssemblyAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  int PassSubtarget = 0;
+  int Variant = 0;
+  bit isMCAsmWriter = 1;
+}
+
 def WebAssembly : Target {
   let InstructionSet = WebAssemblyInstrInfo;
   let AssemblyParsers  = [WebAssemblyAsmParser];
+  let AssemblyWriters = [WebAssemblyAsmWriter];
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -57,6 +57,10 @@ def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
 }
 } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
 
+// This is technically a control-flow instruction, since all it affects is the
+// IP.
+def NOP : I<(outs), (ins), [], "nop", 0x01>;
+
 // Placemarkers to indicate the start or end of a block or loop scope.
 // These use/clobber VALUE_STACK to prevent them from being moved into the
 // middle of an expression tree.
diff --git a/llvm/test/MC/Disassembler/WebAssembly/lit.local.cfg b/llvm/test/MC/Disassembler/WebAssembly/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'WebAssembly' in config.root.targets:
+    config.unsupported = True
+
diff --git a/llvm/test/MC/Disassembler/WebAssembly/wasm.txt b/llvm/test/MC/Disassembler/WebAssembly/wasm.txt
@@ -0,0 +1,33 @@
+# RUN: llvm-mc --disassemble %s -triple=wasm32-unknown-unknown | FileCheck %s
+
+# CHECK: .text
+
+# CHECK: nop
+0x01
+
+# CHECK: i32.add $0=, $0, $0
+# NOTE: registers are meaningless, as there is no context for what they are.
+0x6a
+
+# CHECK: i64.const $0=, -1
+0x42 0x7F
+
+# CHECK: i64.load32_u $0=, 16($0):p2align=1
+0x35 0x01 0x10
+
+# CHECK: block
+# 3
+# FIXME: WebAssemblyInstPrinter does not currently print block number.
+0x02 0x03
+
+# CHECK: call_indirect
+# $0=, 128, 0
+# FIXME: WebAssemblyInstPrinter does not print immediates.
+0x11 0x80 0x01 0x00
+
+# CHECK: get_local $0=, 128
+0x20 0x80 0x01
+
+# Prefix byte example:
+# CHECK: i64.trunc_u:sat/f64 $0=, $0
+0xFC 0x07
diff --git a/llvm/unittests/MC/Disassembler.cpp b/llvm/unittests/MC/Disassembler.cpp
@@ -21,7 +21,7 @@ static const char *symbolLookupCallback(void *DisInfo, uint64_t ReferenceValue,
   return nullptr;
 }
 
-TEST(Disassembler, Test1) {
+TEST(Disassembler, X86Test) {
   llvm::InitializeAllTargetInfos();
   llvm::InitializeAllTargetMCs();
   llvm::InitializeAllDisassemblers();
@@ -62,3 +62,46 @@ TEST(Disassembler, Test1) {
 
   LLVMDisasmDispose(DCR);
 }
+
+// Disassembles three back-to-back WebAssembly instructions through the C
+// disassembler API and checks the size and printed text reported for each.
+TEST(Disassembler, WebAssemblyTest) {
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllDisassemblers();
+
+  // Encodings, in order: i32.add (1 byte), i64.const -1 (2 bytes),
+  // i64.load32_u with p2align=1 and offset 16 (3 bytes).
+  uint8_t Bytes[] = {0x6a, 0x42, 0x7F, 0x35, 0x01, 0x10};
+  uint8_t *BytesP = Bytes;
+  const char OutStringSize = 100;
+  char OutString[OutStringSize];
+  LLVMDisasmContextRef DCR = LLVMCreateDisasm(
+      "wasm32-unknown-unknown-elf", nullptr, 0, nullptr, symbolLookupCallback);
+  // No context could be created for this triple; end the test without
+  // failing (presumably the WebAssembly target is not built in that case;
+  // TODO confirm).
+  if (!DCR)
+    return;
+
+  size_t InstSize;
+  unsigned NumBytes = sizeof(Bytes);
+  unsigned PC = 0;
+
+  InstSize = LLVMDisasmInstruction(DCR, BytesP, NumBytes, PC, OutString,
+                                   OutStringSize);
+  EXPECT_EQ(InstSize, 1U);
+  EXPECT_EQ(StringRef(OutString), "\ti32.add \t$0=, $0, $0");
+  // Step past the instruction just decoded before decoding the next one.
+  PC += InstSize;
+  BytesP += InstSize;
+  NumBytes -= InstSize;
+
+  InstSize = LLVMDisasmInstruction(DCR, BytesP, NumBytes, PC, OutString,
+                                   OutStringSize);
+  EXPECT_EQ(InstSize, 2U);
+  EXPECT_EQ(StringRef(OutString), "\ti64.const\t$0=, -1");
+
+  PC += InstSize;
+  BytesP += InstSize;
+  NumBytes -= InstSize;
+
+  InstSize = LLVMDisasmInstruction(DCR, BytesP, NumBytes, PC, OutString,
+                                   OutStringSize);
+  EXPECT_EQ(InstSize, 3U);
+  EXPECT_EQ(StringRef(OutString), "\ti64.load32_u\t$0=, 16($0):p2align=1");
+
+  LLVMDisasmDispose(DCR);
+}
diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt
@@ -44,6 +44,7 @@ add_tablegen(llvm-tblgen LLVM
   X86FoldTablesEmitter.cpp
   X86ModRMFilters.cpp
   X86RecognizableInstr.cpp
+  WebAssemblyDisassemblerEmitter.cpp
   CTagsEmitter.cpp
   )
 set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning")
diff --git a/llvm/utils/TableGen/DisassemblerEmitter.cpp b/llvm/utils/TableGen/DisassemblerEmitter.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodeGenTarget.h"
+#include "WebAssemblyDisassemblerEmitter.h"
 #include "X86DisassemblerTables.h"
 #include "X86RecognizableInstr.h"
 #include "llvm/TableGen/Error.h"
@@ -125,6 +126,14 @@ void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) {
     return;
   }
 
+  // WebAssembly has variable length opcodes, so can't use EmitFixedLenDecoder
+  // below (which depends on a Size table-gen Record), and also uses a custom
+  // disassembler.
+  if (Target.getName() == "WebAssembly") {
+    emitWebAssemblyDisassemblerTables(OS, Target.getInstructionsByEnumValue());
+    return;
+  }
+
   // ARM and Thumb have a CHECK() macro to deal with DecodeStatuses.
   if (Target.getName() == "ARM" || Target.getName() == "Thumb" ||
       Target.getName() == "AArch64" || Target.getName() == "ARM64") {
diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+if not 'WebAssembly' in config.root.targets:`
	`2`	`+ config.unsupported = True`
	`3`	`+`
Original file line number	Diff line number	Diff line change
`@@ -44,6 +44,7 @@ add_tablegen(llvm-tblgen LLVM`
`44`	`44`	`X86FoldTablesEmitter.cpp`
`45`	`45`	`X86ModRMFilters.cpp`
`46`	`46`	`X86RecognizableInstr.cpp`
	`47`	`+ WebAssemblyDisassemblerEmitter.cpp`
`47`	`48`	`CTagsEmitter.cpp`
`48`	`49`	`)`
`49`	`50`	`set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning")`
-Original file line number
+Diff line change
 +//===- WebAssemblyDisassemblerEmitter.cpp - Disassembler tables -*- C++ -*-===//
 +//
 +//                     The LLVM Compiler Infrastructure
 +//
 +// This file is distributed under the University of Illinois Open Source
 +// License. See LICENSE.TXT for details.
 +//
 +//===----------------------------------------------------------------------===//
 +//
 +// This file is part of the WebAssembly Disassembler Emitter.
 +// It contains the implementation of the disassembler tables.
 +// Documentation for the disassembler emitter in general can be found in
 +// WebAssemblyDisassemblerEmitter.h.
 +//
 +//===----------------------------------------------------------------------===//
++
 +#include "WebAssemblyDisassemblerEmitter.h"
 +#include "llvm/TableGen/Record.h"
++
 +namespace llvm {
++
 +/// Writes to \p OS the C++ source for the WebAssembly disassembler lookup
 +/// tables: one 256-entry InstructionTable<N> for each opcode prefix N that
 +/// has at least one instruction, followed by a PrefixTable that maps every
 +/// non-zero prefix to its table and ends with a { 0, nullptr } entry.
 +/// \p NumberedInstructions supplies the instructions; each one's index in
 +/// that array is what gets emitted as its Opcode field.
 +void emitWebAssemblyDisassemblerTables(
 +    raw_ostream &OS,
 +    const ArrayRef<const CodeGenInstruction *> &NumberedInstructions) {
 +  // First let's organize all opcodes by (prefix) byte. Prefix 0 is the
 +  // starting table.
 +  // Outer key: prefix; inner key: low opcode byte; value: (index into
 +  // NumberedInstructions, instruction).
 +  std::map<unsigned,
 +           std::map<unsigned, std::pair<unsigned, const CodeGenInstruction *>>>
 +      OpcodeTable;
 +  for (unsigned I = 0; I != NumberedInstructions.size(); ++I) {
 +    auto &CGI = *NumberedInstructions[I];
 +    auto &Def = *CGI.TheDef;
 +    if (!Def.getValue("Inst"))
 +      continue;
 +    auto &Inst = *Def.getValueAsBitsInit("Inst");
 +    auto Opc = static_cast<unsigned>(
 +        reinterpret_cast<IntInit *>(Inst.convertInitializerTo(IntRecTy::get()))
 +            ->getValue());
 +    if (Opc == 0xFFFFFFFF)
 +      continue; // No opcode defined.
 +    assert(Opc <= 0xFFFF);
 +    // The high byte is the prefix (0 when there is none); the low byte is
 +    // the index within that prefix's table.
 +    auto Prefix = Opc >> 8;
 +    Opc = Opc & 0xFF;
 +    auto &CGIP = OpcodeTable[Prefix][Opc];
 +    if (!CGIP.second ||
 +        // Make sure we store the variant with the least amount of operands,
 +        // which is the one without explicit registers. Only few instructions
 +        // have these currently, would be good to have for all of them.
 +        // FIXME: this picks the first of many typed variants, which is
 +        // currently the except_ref one, though this shouldn't matter for
 +        // disassembly purposes.
 +        CGIP.second->Operands.OperandList.size() >
 +            CGI.Operands.OperandList.size()) {
 +      CGIP = std::make_pair(I, &CGI);
 +    }
 +  }
 +  // Emit the shared declarations used by every table.
 +  OS << "#include \"MCTargetDesc/WebAssemblyMCTargetDesc.h\"\n";
 +  OS << "\n";
 +  OS << "namespace llvm {\n\n";
 +  OS << "enum EntryType : uint8_t { ";
 +  OS << "ET_Unused, ET_Prefix, ET_Instruction };\n\n";
 +  OS << "struct WebAssemblyInstruction {\n";
 +  OS << "  uint16_t Opcode;\n";
 +  OS << "  EntryType ET;\n";
 +  OS << "  uint8_t NumOperands;\n";
 +  OS << "  uint8_t Operands[4];\n";
 +  OS << "};\n\n";
 +  // Output one table per prefix.
 +  for (auto &PrefixPair : OpcodeTable) {
 +    if (PrefixPair.second.empty())
 +      continue;
 +    OS << "WebAssemblyInstruction InstructionTable" << PrefixPair.first;
 +    OS << "[] = {\n";
 +    // Every table gets exactly 256 entries so it can be indexed by a byte.
 +    for (unsigned I = 0; I <= 0xFF; I++) {
 +      auto InstIt = PrefixPair.second.find(I);
 +      if (InstIt != PrefixPair.second.end()) {
 +        // Regular instruction.
 +        assert(InstIt->second.second);
 +        auto &CGI = *InstIt->second.second;
 +        OS << "  // 0x";
 +        OS.write_hex(static_cast<unsigned long long>(I));
 +        OS << ": " << CGI.AsmString << "\n";
 +        OS << "  { " << InstIt->second.first << ", ET_Instruction, ";
 +        OS << CGI.Operands.OperandList.size() << ", {\n";
 +        for (auto &Op : CGI.Operands.OperandList) {
 +          OS << "      " << Op.OperandType << ",\n";
 +        }
 +        OS << "    }\n";
 +      } else {
 +        auto PrefixIt = OpcodeTable.find(I);
 +        // If we have a non-empty table for it that's not 0, this is a prefix.
 +        if (PrefixIt != OpcodeTable.end() && I && !PrefixPair.first) {
 +          OS << "  { 0, ET_Prefix, 0, {}";
 +        } else {
 +          OS << "  { 0, ET_Unused, 0, {}";
 +        }
 +      }
 +      OS << "  },\n";
 +    }
 +    OS << "};\n\n";
 +  }
 +  // Create a table of all extension tables:
 +  OS << "struct { uint8_t Prefix; const WebAssemblyInstruction *Table; }\n";
 +  OS << "PrefixTable[] = {\n";
 +  for (auto &PrefixPair : OpcodeTable) {
 +    // Prefix 0 is the base table and is not listed as an extension.
 +    if (PrefixPair.second.empty() || !PrefixPair.first)
 +      continue;
 +    OS << "  { " << PrefixPair.first << ", InstructionTable"
 +       << PrefixPair.first;
 +    OS << " },\n";
 +  }
 +  // Null-table sentinel that terminates the list.
 +  OS << "  { 0, nullptr }\n};\n\n";
 +  OS << "} // End llvm namespace\n";
 +}
++
 +} // namespace llvm