Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -44,6 +44,14 @@
 def int_aarch64_dsb : GCCBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">, Intrinsic<[], [llvm_i32_ty]>;
 def int_aarch64_isb : GCCBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">, Intrinsic<[], [llvm_i32_ty]>;
 
+// A space-consuming intrinsic primarily for testing block and jump table
+// placements. The first argument is the number of bytes this "instruction"
+// takes up; the second argument and the return value are essentially chains,
+// used to force ordering during ISel.
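+// For example, "%t = call i64 @llvm.aarch64.space(i32 1024, i64 %chain)"
+// occupies 1024 bytes of code while doing no other work.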
"llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -77,6 +79,12 @@ return MCInstLowering.lowerOperand(MO, MCOp); } + void EmitJumpTableInfo() override; + void emitJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, unsigned JTI); + + void LowerJumpTableDestSmall(MCStreamer &OutStreamer, const MachineInstr &MI); + void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI); void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, @@ -433,6 +441,104 @@ printOperand(MI, NOps - 2, OS); } +void AArch64AsmPrinter::EmitJumpTableInfo() { + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + if (!MJTI) return; + + const std::vector &JT = MJTI->getJumpTables(); + if (JT.empty()) return; + + const TargetLoweringObjectFile &TLOF = getObjFileLowering(); + MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM); + OutStreamer->SwitchSection(ReadOnlySec); + + auto AFI = MF->getInfo(); + for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) { + const std::vector &JTBBs = JT[JTI].MBBs; + + // If this jump table was deleted, ignore it. + if (JTBBs.empty()) continue; + + unsigned Size = AFI->getJumpTableEntrySize(JTI); + EmitAlignment(Log2_32(Size)); + OutStreamer->EmitLabel(GetJTISymbol(JTI)); + + for (auto *JTBB : JTBBs) + emitJumpTableEntry(MJTI, JTBB, JTI); + } +} + +void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, + unsigned JTI) { + const MCExpr *Value = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); + auto AFI = MF->getInfo(); + unsigned Size = AFI->getJumpTableEntrySize(JTI); + + if (Size == 4) { + // .word LBB - LJTI + const TargetLowering *TLI = MF->getSubtarget().getTargetLowering(); + const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, JTI, OutContext); + Value = MCBinaryExpr::createSub(Value, Base, OutContext); + } else { + // .byte (LBB - LBB) >> 2 (or .hword) + const MCSymbol *BaseSym = AFI->getJumpTableEntryPCRelSymbol(JTI); + const MCExpr *Base = MCSymbolRefExpr::create(BaseSym, OutContext); + Value = MCBinaryExpr::createSub(Value, Base, OutContext); + Value = MCBinaryExpr::createLShr( + Value, MCConstantExpr::create(2, OutContext), OutContext); + } + + OutStreamer->EmitValue(Value, Size); +} + +/// Small jump tables contain an unsigned byte or half, representing the offset +/// from the lowest-addressed possible destination to the desired basic +/// block. Since all instructions are 4-byte aligned, this is further compressed +/// by counting in instructions rather than bytes (i.e. divided by 4). So, to +/// materialize the correct destination we need: +/// +/// adr xDest, .LBB0_0 +/// ldrb wScratch, [xTable, xEntry] (with "lsl #1" for ldrh). 
+void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer,
+                                                const llvm::MachineInstr &MI) {
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned ScratchReg = MI.getOperand(1).getReg();
+  unsigned ScratchRegW =
+      STI->getRegisterInfo()->getSubReg(ScratchReg, AArch64::sub_32);
+  unsigned TableReg = MI.getOperand(2).getReg();
+  unsigned EntryReg = MI.getOperand(3).getReg();
+  int JTIdx = MI.getOperand(4).getIndex();
+  bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8;
+
+  // This has to be first because the compression pass bases its reachability
+  // calculations on the start of the JumpTableDest instruction.
+  auto Label =
+      MF->getInfo<AArch64FunctionInfo>()->getJumpTableEntryPCRelSymbol(JTIdx);
+  EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADR)
+                                  .addReg(DestReg)
+                                  .addExpr(MCSymbolRefExpr::create(
+                                      Label, MF->getContext())));
+
+  // Load the number of instruction-steps to offset from the label.
+  unsigned LdrOpcode = IsByteEntry ? AArch64::LDRBBroX : AArch64::LDRHHroX;
+  EmitToStreamer(OutStreamer, MCInstBuilder(LdrOpcode)
+                                  .addReg(ScratchRegW)
+                                  .addReg(TableReg)
+                                  .addReg(EntryReg)
+                                  .addImm(0)
+                                  .addImm(IsByteEntry ? 0 : 1));
+
+  // Multiply the steps by 4 and add to the already materialized base label
+  // address.
+  EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADDXrs)
+                                  .addReg(DestReg)
+                                  .addReg(DestReg)
+                                  .addReg(ScratchReg)
+                                  .addImm(2));
+}
+
 void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
                                       const MachineInstr &MI) {
   unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes();
@@ -662,6 +768,32 @@
     return;
   }
 
+  case AArch64::JumpTableDest32: {
+    // We want:
+    //     ldrsw xScratch, [xTable, xEntry, lsl #2]
+    //     add xDest, xTable, xScratch
+    unsigned DestReg = MI->getOperand(0).getReg(),
+             ScratchReg = MI->getOperand(1).getReg(),
+             TableReg = MI->getOperand(2).getReg(),
+             EntryReg = MI->getOperand(3).getReg();
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::LDRSWroX)
+                                     .addReg(ScratchReg)
+                                     .addReg(TableReg)
+                                     .addReg(EntryReg)
+                                     .addImm(0)
+                                     .addImm(1));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXrs)
+                                     .addReg(DestReg)
+                                     .addReg(TableReg)
+                                     .addReg(ScratchReg)
+                                     .addImm(0));
+    return;
+  }
+  case AArch64::JumpTableDest16:
+  case AArch64::JumpTableDest8:
+    LowerJumpTableDestSmall(*OutStreamer, *MI);
+    return;
+
   case AArch64::FMOVH0:
   case AArch64::FMOVS0:
   case AArch64::FMOVD0:
Index: llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
@@ -0,0 +1,168 @@
+//==-- AArch64CompressJumpTables.cpp - Compress jump tables for AArch64 --====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This pass looks at the basic blocks each jump-table refers to and works out
+// whether they can be emitted in a compressed form (with 8 or 16-bit
+// entries). If so, it changes the opcode and flags them in the associated
+// AArch64FunctionInfo.
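+//
+// Block offsets are computed from AArch64InstrInfo::getInstSizeInBytes, so
+// this must run after block layout and branch relaxation, once those sizes
+// and offsets are final.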
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-jump-tables"
+
+STATISTIC(NumJT8, "Number of jump-tables with 1-byte entries");
+STATISTIC(NumJT16, "Number of jump-tables with 2-byte entries");
+STATISTIC(NumJT32, "Number of jump-tables with 4-byte entries");
+
+namespace {
+class AArch64CompressJumpTables : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  MachineFunction *MF;
+  SmallVector<int, 16> BlockInfo;
+
+  int computeBlockSize(MachineBasicBlock &MBB);
+  void scanFunction();
+
+  bool compressJumpTable(MachineInstr &MI, int Offset);
+
+public:
+  static char ID;
+  AArch64CompressJumpTables() : MachineFunctionPass(ID) {
+    initializeAArch64CompressJumpTablesPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+  StringRef getPassName() const override {
+    return "AArch64 Compress Jump Tables";
+  }
+};
+char AArch64CompressJumpTables::ID = 0;
+}
+
+INITIALIZE_PASS(AArch64CompressJumpTables, DEBUG_TYPE,
+                "AArch64 compress jump tables pass", false, false)
+
+int AArch64CompressJumpTables::computeBlockSize(MachineBasicBlock &MBB) {
+  int Size = 0;
+  for (const MachineInstr &MI : MBB)
+    Size += TII->getInstSizeInBytes(MI);
+  return Size;
+}
+
+void AArch64CompressJumpTables::scanFunction() {
+  BlockInfo.clear();
+  BlockInfo.resize(MF->getNumBlockIDs());
+
+  int Offset = 0;
+  for (MachineBasicBlock &MBB : *MF) {
+    BlockInfo[MBB.getNumber()] = Offset;
+    Offset += computeBlockSize(MBB);
+  }
+}
+
+bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI,
+                                                  int Offset) {
+  if (MI.getOpcode() != AArch64::JumpTableDest32)
+    return false;
+
+  int JTIdx = MI.getOperand(4).getIndex();
+  auto &JTInfo = *MF->getJumpTableInfo();
+  const MachineJumpTableEntry &JT = JTInfo.getJumpTables()[JTIdx];
+
+  // The jump-table might have been optimized away.
+  if (JT.MBBs.empty())
+    return false;
+
+  int MaxOffset = std::numeric_limits<int>::min(),
+      MinOffset = std::numeric_limits<int>::max();
+  MachineBasicBlock *MinBlock = nullptr;
+  for (auto Block : JT.MBBs) {
+    int BlockOffset = BlockInfo[Block->getNumber()];
+    assert(BlockOffset % 4 == 0 && "misaligned basic block");
+
+    MaxOffset = std::max(MaxOffset, BlockOffset);
+    if (BlockOffset <= MinOffset) {
+      MinOffset = BlockOffset;
+      MinBlock = Block;
+    }
+  }
+
+  // The ADR instruction needed to calculate the address of the first reachable
+  // basic block can address +/-1MB.
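+  // That is a signed 21-bit byte offset from the ADR itself, i.e. the range
+  // [-2^20, 2^20 - 1] checked by isInt<21> below.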
+  if (!isInt<21>(MinOffset - Offset)) {
+    ++NumJT32;
+    return false;
+  }
+
+  int Span = MaxOffset - MinOffset;
+  auto AFI = MF->getInfo<AArch64FunctionInfo>();
+  if (isUInt<8>(Span / 4)) {
+    AFI->setJumpTableEntryInfo(JTIdx, 1, MinBlock->getSymbol());
+    MI.setDesc(TII->get(AArch64::JumpTableDest8));
+    ++NumJT8;
+    return true;
+  } else if (isUInt<16>(Span / 4)) {
+    AFI->setJumpTableEntryInfo(JTIdx, 2, MinBlock->getSymbol());
+    MI.setDesc(TII->get(AArch64::JumpTableDest16));
+    ++NumJT16;
+    return true;
+  }
+
+  ++NumJT32;
+  return false;
+}
+
+bool AArch64CompressJumpTables::runOnMachineFunction(MachineFunction &MFIn) {
+  bool Changed = false;
+  MF = &MFIn;
+
+  const auto &ST = MF->getSubtarget<AArch64Subtarget>();
+  TII = ST.getInstrInfo();
+
+  if (ST.force32BitJumpTables() && !MF->getFunction().optForMinSize())
+    return false;
+
+  scanFunction();
+
+  for (MachineBasicBlock &MBB : *MF) {
+    int Offset = BlockInfo[MBB.getNumber()];
+    for (MachineInstr &MI : MBB) {
+      Changed |= compressJumpTable(MI, Offset);
+      Offset += TII->getInstSizeInBytes(MI);
+    }
+  }
+
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64CompressJumpTablesPass() {
+  return new AArch64CompressJumpTables();
+}
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -607,6 +607,7 @@
                          SDValue TVal, SDValue FVal, const SDLoc &dl,
                          SelectionDAG &DAG) const;
   SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -187,7 +187,7 @@
   setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
-  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
 
   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
@@ -2763,6 +2763,8 @@
     return LowerSELECT_CC(Op, DAG);
   case ISD::JumpTable:
     return LowerJumpTable(Op, DAG);
+  case ISD::BR_JT:
+    return LowerBR_JT(Op, DAG);
   case ISD::ConstantPool:
     return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:
@@ -4816,6 +4818,24 @@
   return getAddr(JT, DAG);
 }
 
+SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  // Jump table entries as PC relative offsets. No additional tweaking
+  // is necessary here. Just get the address of the jump table.
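+  // Operand 0 is the chain, operand 1 the jump-table node and operand 2 the
+  // index of the selected entry.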
+  SDLoc DL(Op);
+  SDValue JT = Op.getOperand(1);
+  SDValue Entry = Op.getOperand(2);
+  int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
+
+  SDNode *Dest =
+      DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
+                         Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
+  return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
+                     SDValue(Dest, 0));
+}
+
 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                  SelectionDAG &DAG) const {
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -108,6 +108,14 @@
     // This gets lowered to an instruction sequence which takes 16 bytes
     NumBytes = 16;
     break;
+  case AArch64::JumpTableDest32:
+  case AArch64::JumpTableDest16:
+  case AArch64::JumpTableDest8:
+    NumBytes = 12;
+    break;
+  case AArch64::SPACE:
+    NumBytes = MI.getOperand(1).getImm();
+    break;
   }
 
   return NumBytes;
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -448,6 +448,32 @@
 def : Pat<(AArch64LOADgot tconstpool:$addr),
           (LOADgot tconstpool:$addr)>;
 
+// 32-bit jump table destination is actually only 2 instructions since we can
+// use the table itself as a PC-relative base. But optimization occurs after
+// branch relaxation so be pessimistic.
+let Size = 12, Constraints = "@earlyclobber $dst,@earlyclobber $scratch" in {
+def JumpTableDest32 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                             (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                      Sched<[]>;
+def JumpTableDest16 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                             (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                      Sched<[]>;
+def JumpTableDest8 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                            (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                     Sched<[]>;
+}
+
+// Space-consuming pseudo to aid testing of placement and reachability
+// algorithms. Immediate operand is the number of bytes this "instruction"
+// occupies; register operands can be used to enforce dependency and constrain
+// the scheduler.
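+// The pattern below matches it directly from int_aarch64_space; see
+// test/CodeGen/AArch64/jump-table-compress.mir for typical uses.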
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
+def SPACE : Pseudo<(outs GPR64:$Rd), (ins i32imm:$size, GPR64:$Rn),
+                   [(set GPR64:$Rd, (int_aarch64_space imm:$size, GPR64:$Rn))]>,
+            Sched<[]>;
+
 //===----------------------------------------------------------------------===//
 // System instructions.
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -162,6 +162,19 @@
   unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
   void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
 
+  unsigned getJumpTableEntrySize(int Idx) const {
+    auto It = JumpTableEntryInfo.find(Idx);
+    if (It != JumpTableEntryInfo.end())
+      return It->second.first;
+    return 4;
+  }
+  MCSymbol *getJumpTableEntryPCRelSymbol(int Idx) const {
+    return JumpTableEntryInfo.find(Idx)->second.second;
+  }
+  void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym) {
+    JumpTableEntryInfo[Idx] = std::make_pair(Size, PCRelSym);
+  }
+
   using SetOfInstructions = SmallPtrSet<const MachineInstr *, 16>;
 
   const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
@@ -200,6 +213,8 @@
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;
   SetOfInstructions LOHRelated;
+
+  DenseMap<int, std::pair<unsigned, MCSymbol *>> JumpTableEntryInfo;
 };
 
 } // end namespace llvm
Index: llvm/lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -142,6 +142,7 @@
   bool HasFuseLiterals = false;
   bool DisableLatencySchedHeuristic = false;
   bool UseRSqrt = false;
+  bool Force32BitJumpTables = false;
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 3;
   uint16_t CacheLineSize = 0;
@@ -292,6 +293,7 @@
   }
 
   bool useRSqrt() const { return UseRSqrt; }
+  bool force32BitJumpTables() const { return Force32BitJumpTables; }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const {
     return VectorInsertExtractBaseCost;
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -123,6 +123,12 @@
     BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
                      cl::desc("Relax out of range conditional branches"));
 
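+// When disabled, jump tables keep the 32-bit entries created during ISel and
+// are never compressed.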
+static cl::opt<bool> EnableCompressJumpTables(
+    "aarch64-enable-compress-jump-tables", cl::Hidden, cl::init(true),
+    cl::desc("Use smallest entry possible for jump tables"));
+
 // FIXME: Unify control over GlobalMerge.
 static cl::opt<cl::boolOrDefault>
     EnableGlobalMerge("aarch64-enable-global-merge", cl::Hidden,
@@ -158,6 +162,7 @@
   initializeAArch64AdvSIMDScalarPass(*PR);
   initializeAArch64BranchTargetsPass(*PR);
   initializeAArch64CollectLOHPass(*PR);
+  initializeAArch64CompressJumpTablesPass(*PR);
   initializeAArch64ConditionalComparesPass(*PR);
   initializeAArch64ConditionOptimizerPass(*PR);
   initializeAArch64DeadRegisterDefinitionsPass(*PR);
@@ -546,6 +551,9 @@
   if (EnableBranchTargets)
     addPass(createAArch64BranchTargetsPass());
 
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableCompressJumpTables)
+    addPass(createAArch64CompressJumpTablesPass());
+
   if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
       TM->getTargetTriple().isOSBinFormatMachO())
     addPass(createAArch64CollectLOHPass());
Index: llvm/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AArch64/CMakeLists.txt
+++ llvm/lib/Target/AArch64/CMakeLists.txt
@@ -34,6 +34,7 @@
   AArch64FastISel.cpp
   AArch64A53Fix835769.cpp
   AArch64FrameLowering.cpp
+  AArch64CompressJumpTables.cpp
   AArch64ConditionOptimizer.cpp
   AArch64RedundantCopyElimination.cpp
   AArch64ISelDAGToDAG.cpp
Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -151,6 +151,7 @@
 ; CHECK-NEXT: Branch Probability Basic Block Placement
 ; CHECK-NEXT: Branch relaxation pass
 ; CHECK-NEXT: AArch64 Branch Targets
+; CHECK-NEXT: AArch64 Compress Jump Tables
 ; CHECK-NEXT: Contiguously Lay Out Funclets
 ; CHECK-NEXT: StackMap Liveness Analysis
 ; CHECK-NEXT: Live DEBUG_VALUE analysis
Index: llvm/test/CodeGen/AArch64/jump-table-compress.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/jump-table-compress.mir
@@ -0,0 +1,112 @@
+# RUN: llc -mtriple=aarch64-linux-gnu %s -run-pass=aarch64-jump-tables -o - | FileCheck %s
+--- |
+  define i32 @test_jumptable(i32 %in) {
+    unreachable
+  }
+
+...
+---
+name:            test_jumptable
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+jumpTable:
+  kind:            block-address
+  entries:
+    - id:              0
+      blocks:          [ '%bb.2', '%bb.3' ]
+    - id:              1
+      blocks:          [ '%bb.4', '%bb.5' ]
+    - id:              2
+      blocks:          [ '%bb.7' ]
+    - id:              3
+      blocks:          [ '%bb.9' ]
+    - id:              4
+      blocks:          [ '%bb.9' ]
+    - id:              5
+      blocks:          [ '%bb.11' ]
+body:             |
+  bb.0 (%ir-block.0):
+
+  bb.1 (%ir-block.0):
+    ; CHECK-LABEL: body:
+    ; CHECK-LABEL: bb.1
+    ; CHECK: JumpTableDest8
+    liveins: $x8
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.0
+    BR killed $x10
+
+  bb.2:
+    ; Last destination is 4 * 255 = 1020 bytes after the first. A byte entry is OK.
+    dead $xzr = SPACE 1020, undef $xzr
+
+  bb.3:
+    ; CHECK-LABEL: bb.3
+    ; CHECK: JumpTableDest16
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.1
+    BR killed $x10
+
+  bb.4:
+    ; Last destination is 4 * 256 = 1024 bytes after the first. A halfword entry is needed.
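+    ; Span/4 = 256 does not fit in an unsigned byte but does fit in 16 bits.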
+    dead $xzr = SPACE 1024, undef $xzr
+
+  bb.5:
+    ; CHECK-LABEL: bb.5
+    ; CHECK: JumpTableDest8
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.2
+    BR killed $x10
+
+  bb.6:
+    ; First destination is (2^20 - 4) bytes after the reference. Just reachable
+    ; by ADR, so the compressed table can be used.
+    dead $xzr = SPACE 1048556, undef $xzr
+
+  bb.7:
+    ; CHECK-LABEL: bb.7
+    ; CHECK: JumpTableDest32
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.3
+    BR killed $x10
+
+  bb.8:
+    ; First destination is 2^20 bytes after the reference. A compressed table
+    ; cannot reach it.
+    dead $xzr = SPACE 1048560, undef $xzr
+
+  bb.9:
+    ; First destination is 2^20 bytes before the reference. Just within reach
+    ; of ADR.
+    dead $xzr = SPACE 1048576, undef $xzr
+
+  bb.10:
+    ; CHECK-LABEL: bb.10
+    ; CHECK: JumpTableDest8
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.4
+    BR killed $x10
+
+  bb.11:
+    ; First destination is (2^20 + 4) bytes before the reference. Out of reach
+    ; of ADR, so the table stays uncompressed.
+    dead $xzr = SPACE 1048580, undef $xzr
+
+  bb.12:
+    ; CHECK-LABEL: bb.12
+    ; CHECK: JumpTableDest32
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.5
+    BR killed $x10
+...
Index: llvm/test/CodeGen/AArch64/jump-table-exynos.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/jump-table-exynos.ll
@@ -0,0 +1,69 @@
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mattr=+force-32bit-jump-tables -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m1 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m2 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+
+; Exynos doesn't want jump tables to be compressed for now.
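+; Exynos-M1, M2 and M3 all set FeatureForce32BitJumpTables, so compressed
+; entries are only used when a function is marked minsize.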
+
+define i32 @test_jumptable(i32 %in) {
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+    i32 2, label %lbl3
+    i32 4, label %lbl4
+  ]
+; CHECK-LABEL: test_jumptable:
+; CHECK-NOT: ldrb
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  ret i32 2
+
+lbl3:
+  ret i32 4
+
+lbl4:
+  ret i32 8
+}
+
+define i32 @test_jumptable_minsize(i32 %in) minsize {
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+    i32 2, label %lbl3
+    i32 4, label %lbl4
+  ]
+; CHECK-LABEL: test_jumptable_minsize:
+; CHECK: adrp [[JTPAGE:x[0-9]+]], .LJTI1_0
+; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI1_0
+; CHECK: adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK: ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK: add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK: br [[DEST]]
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  ret i32 2
+
+lbl3:
+  ret i32 4
+
+lbl4:
+  ret i32 8
+}
Index: llvm/test/CodeGen/AArch64/jump-table.ll
===================================================================
--- llvm/test/CodeGen/AArch64/jump-table.ll
+++ llvm/test/CodeGen/AArch64/jump-table.ll
@@ -1,7 +1,8 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
-; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -aarch64-enable-atomic-cfg-tidy=0 -o - %s | FileCheck --check-prefix=CHECK-PIC %s
-; RUN: llc -code-model=tiny -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-TINY %s
+; RUN: llc -no-integrated-as -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -no-integrated-as -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -no-integrated-as -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -aarch64-enable-atomic-cfg-tidy=0 -o - %s | FileCheck --check-prefix=CHECK-PIC %s
+; RUN: llc -no-integrated-as -code-model=tiny -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-TINY %s
+; RUN: llc -no-integrated-as -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-IOS %s
 
 define i32 @test_jumptable(i32 %in) {
 ; CHECK: test_jumptable
@@ -12,27 +13,45 @@
     i32 2, label %lbl3
     i32 4, label %lbl4
   ]
-; CHECK: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
-; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
-; CHECK: ldr [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #3]
-; CHECK: br [[DEST]]
-
-; CHECK-LARGE: movz x[[JTADDR:[0-9]+]], #:abs_g0_nc:.LJTI0_0
-; CHECK-LARGE: movk x[[JTADDR]], #:abs_g1_nc:.LJTI0_0
-; CHECK-LARGE: movk x[[JTADDR]], #:abs_g2_nc:.LJTI0_0
-; CHECK-LARGE: movk x[[JTADDR]], #:abs_g3:.LJTI0_0
-; CHECK-LARGE: ldr [[DEST:x[0-9]+]], [x[[JTADDR]], {{x[0-9]+}}, lsl #3]
-; CHECK-LARGE: br [[DEST]]
-
-; CHECK-PIC: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
-; CHECK-PIC: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
-; CHECK-PIC: ldrsw [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #2]
-; CHECK-PIC: add [[TABLE:x[0-9]+]], [[DEST]], x[[JT]]
-; CHECK-PIC: br [[TABLE]]
-
-; CHECK-TINY: adr x[[JT:[0-9]+]], .LJTI0_0
-; CHECK-TINY: ldr [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #3]
-; CHECK-TINY: br [[DEST]]
+; CHECK-LABEL: test_jumptable:
+; CHECK: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
+; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
+; CHECK: adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK: ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK: add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK: br [[DEST]]
+
+; CHECK-LARGE: movz x[[JTADDR:[0-9]+]], #:abs_g0_nc:.LJTI0_0
+; CHECK-LARGE: movk x[[JTADDR]], #:abs_g1_nc:.LJTI0_0
+; CHECK-LARGE: movk x[[JTADDR]], #:abs_g2_nc:.LJTI0_0
+; CHECK-LARGE: movk x[[JTADDR]], #:abs_g3:.LJTI0_0
+; CHECK-LARGE: adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK-LARGE: ldrb w[[OFFSET:[0-9]+]], [x[[JTADDR]], {{x[0-9]+}}]
+; CHECK-LARGE: add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-LARGE: br [[DEST]]
+
+; CHECK-PIC-LABEL: test_jumptable:
+; CHECK-PIC: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
+; CHECK-PIC: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
+; CHECK-PIC: adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK-PIC: ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK-PIC: add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-PIC: br [[DEST]]
+
+; CHECK-IOS: adrp [[JTPAGE:x[0-9]+]], LJTI0_0@PAGE
+; CHECK-IOS: add x[[JT:[0-9]+]], [[JTPAGE]], LJTI0_0@PAGEOFF
+; CHECK-IOS: adr [[PCBASE:x[0-9]+]], [[JTBASE:LBB[0-9]+_[0-9]+]]
+; CHECK-IOS: ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK-IOS: add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-IOS: br [[DEST]]
+
+; CHECK-TINY-LABEL: test_jumptable:
+; CHECK-TINY: adr x[[JT:[0-9]+]], .LJTI0_0
+; CHECK-TINY: adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK-TINY: ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK-TINY: add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-TINY: br [[DEST]]
 
 def:
   ret i32 0
@@ -54,18 +73,86 @@
 
 ; CHECK: .rodata
 ; CHECK: .LJTI0_0:
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
+; CHECK-NEXT: .byte ([[JTBASE]]-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+
+define i32 @test_jumptable16(i32 %in) {
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+    i32 2, label %lbl3
+    i32 4, label %lbl4
+  ]
+; CHECK-LABEL: test_jumptable16:
+; CHECK: adrp [[JTPAGE:x[0-9]+]], .LJTI1_0
+; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI1_0
+; CHECK: adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK: ldrh w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #1]
+; CHECK: add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK: br [[DEST]]
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
"1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""() + call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""() + call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""() + call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""() + call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""() + call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""() + call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""() + ret i32 2 + +lbl3: + ret i32 4 + +lbl4: + ret i32 8 + +} + +; CHECK: .rodata +; CHECK: .p2align 1 +; CHECK: .LJTI1_0: +; CHECK-NEXT: .hword ([[JTBASE]]-[[JTBASE]])>>2 +; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2 +; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2 +; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2 +; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2 ; CHECK-PIC-NOT: .data_region ; CHECK-PIC-NOT: .LJTI0_0 ; CHECK-PIC: .LJTI0_0: -; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0 -; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0 -; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0 -; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0 -; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0 +; CHECK-PIC-NEXT: .byte ([[JTBASE]]-[[JTBASE]])>>2 +; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2 +; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2 +; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2 +; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2 ; CHECK-PIC-NOT: .end_data_region + +; CHECK-IOS: .section __TEXT,__const +; CHECK-IOS-NOT: .data_region +; CHECK-IOS: LJTI0_0: +; CHECK-IOS-NEXT: .byte ([[JTBASE]]-[[JTBASE]])>>2 +; CHECK-IOS-NEXT: .byte (LBB{{.*}}-[[JTBASE]])>>2 +; CHECK-IOS-NEXT: .byte (LBB{{.*}}-[[JTBASE]])>>2 +; CHECK-IOS-NEXT: .byte (LBB{{.*}}-[[JTBASE]])>>2 +; CHECK-IOS-NEXT: .byte (LBB{{.*}}-[[JTBASE]])>>2 +; CHECK-IOS-NOT: .end_data_region Index: llvm/test/CodeGen/AArch64/min-jump-table.ll =================================================================== --- llvm/test/CodeGen/AArch64/min-jump-table.ll +++ llvm/test/CodeGen/AArch64/min-jump-table.ll @@ -14,8 +14,8 @@ ; CHECK0-NEXT: Jump Tables: ; CHECK0-NEXT: %jump-table.0: ; CHECK0-NOT: %jump-table.1: -; CHECK4-NOT: Jump Tables: -; CHECK8-NOT: Jump Tables: +; CHECK4-NOT: {{^}}Jump Tables: +; CHECK8-NOT: {{^}}Jump Tables: bb1: tail call void @ext(i32 0) br label %return bb2: tail call void @ext(i32 2) br label %return @@ -38,7 +38,7 @@ ; CHECK4-NEXT: Jump Tables: ; CHECK4-NEXT: %jump-table.0: ; CHECK4-NOT: %jump-table.1: -; CHECK8-NOT: Jump Tables: +; CHECK8-NOT: {{^}}Jump Tables: bb1: tail call void @ext(i32 0) br label %return bb2: tail call void @ext(i32 2) br label %return