Index: lib/Target/X86/CMakeLists.txt
===================================================================
--- lib/Target/X86/CMakeLists.txt
+++ lib/Target/X86/CMakeLists.txt
@@ -36,6 +36,7 @@
   X86FixupSetCC.cpp
   X86FlagsCopyLowering.cpp
   X86FloatingPoint.cpp
+  X86FoldXBBExtLoad.cpp
   X86FrameLowering.cpp
   X86InstructionSelector.cpp
   X86ISelDAGToDAG.cpp
Index: lib/Target/X86/X86.h
===================================================================
--- lib/Target/X86/X86.h
+++ lib/Target/X86/X86.h
@@ -127,6 +127,9 @@
 void initializeEvexToVexInstPassPass(PassRegistry &);
 
+FunctionPass *createX86FoldXBBExtLoad();
+
+void initializeFoldXBBExtLoadPassPass(PassRegistry &);
 } // End llvm namespace
 
 #endif
Index: lib/Target/X86/X86FoldXBBExtLoad.cpp
===================================================================
--- lib/Target/X86/X86FoldXBBExtLoad.cpp
+++ lib/Target/X86/X86FoldXBBExtLoad.cpp
@@ -0,0 +1,204 @@
+//===-- X86FoldXBBExtLoad.cpp - Fold cross BB ext/load instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a pass that finds 16-bit to 32-bit sign/zero extensions
+/// that can be folded into earlier loads, possibly across multiple basic
+/// blocks. The pattern is created by X86TargetLowering::EmitCmp: to avoid
+/// 16-bit immediates, that function intentionally creates an extension. If
+/// the extended value comes from memory, the extension can be folded into
+/// the load.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define FOLDXBBEXTLOAD_DESC "X86 Cross BB ZExt/SExt Load Folding"
+#define FOLDXBBEXTLOAD_NAME "x86-fold-xbb-ext-load"
+
+#define DEBUG_TYPE FOLDXBBEXTLOAD_NAME
+
+// Option to provide fine-grained control over this optimization pass.
+static cl::opt<bool>
+    FoldXBBExtLoad("fold-xbb-ext-load",
+                   cl::desc("Fold cross basic block sext/zext load instructions"),
+                   cl::init(true), cl::Hidden);
+
+namespace {
+class FoldXBBExtLoadPass : public MachineFunctionPass {
+  // Find the foldable instruction pattern rooted at MI and do the
+  // transformation if possible. MI is a 16-bit to 32-bit extension
+  // instruction.
+  bool tryFoldInst(MachineInstr *MI);
+
+public:
+  static char ID;
+
+  StringRef getPassName() const override { return FOLDXBBEXTLOAD_DESC; }
+
+  FoldXBBExtLoadPass() : MachineFunctionPass(ID) {
+    initializeFoldXBBExtLoadPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::IsSSA);
+  }
+
+private:
+  MachineFunction *MF;
+  MachineRegisterInfo *MRI;
+};
+
+char FoldXBBExtLoadPass::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS(FoldXBBExtLoadPass, FOLDXBBEXTLOAD_NAME, FOLDXBBEXTLOAD_DESC,
+                false, false)
+
+FunctionPass *llvm::createX86FoldXBBExtLoad() {
+  return new FoldXBBExtLoadPass();
+}
+
+bool FoldXBBExtLoadPass::runOnMachineFunction(MachineFunction &MF) {
+  if (!FoldXBBExtLoad || skipFunction(MF.getFunction()))
+    return false;
+
+  this->MF = &MF;
+  MRI = &MF.getRegInfo();
+
+  // Extension instructions to be deleted.
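+  // Deletion is deferred so the reverse iteration below is not invalidated
+  // while the blocks are still being scanned.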
+  SmallVector<MachineInstr *, 4> MIDelete;
+
+  LLVM_DEBUG(dbgs() << "Start X86FoldXBBExtLoad\n");
+
+  for (auto &MBB : MF)
+    for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
+      MachineInstr *MI = &*I;
+      if (MI->getOpcode() == X86::MOVZX32rr16 ||
+          MI->getOpcode() == X86::MOVSX32rr16)
+        if (tryFoldInst(MI))
+          MIDelete.push_back(MI);
+    }
+
+  bool Changed = !MIDelete.empty();
+  while (!MIDelete.empty()) {
+    MachineInstr *MI = MIDelete.pop_back_val();
+    MachineBasicBlock *MBB = MI->getParent();
+    MBB->erase(MI);
+  }
+
+  LLVM_DEBUG(dbgs() << "End X86FoldXBBExtLoad\n");
+
+  return Changed;
+}
+
+// This function does the actual pattern matching and transformation.
+// It starts from ExtMI, a 16-bit to 32-bit extension instruction.
+bool FoldXBBExtLoadPass::tryFoldInst(MachineInstr *ExtMI) {
+  // All involved 16-bit virtual registers.
+  SmallSetVector<unsigned, 8> AllRegs;
+  // Work list.
+  SmallVector<unsigned, 8> RegList;
+
+  if (TargetRegisterInfo::isPhysicalRegister(ExtMI->getOperand(0).getReg()))
+    return false;
+
+  unsigned Reg = ExtMI->getOperand(1).getReg();
+  RegList.push_back(Reg);
+  AllRegs.insert(Reg);
+
+  // Check that every def reachable from Reg is a 16-bit load.
+  while (!RegList.empty()) {
+    auto RegNo = RegList.pop_back_val();
+    if (TargetRegisterInfo::isPhysicalRegister(RegNo))
+      return false;
+
+    auto *MI = MRI->getVRegDef(RegNo);
+    switch (MI->getOpcode()) {
+    case X86::PHI:
+      for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2)
+        if (MI->getOperand(i).isReg()) {
+          auto NewReg = MI->getOperand(i).getReg();
+          if (AllRegs.insert(NewReg))
+            RegList.push_back(NewReg);
+        }
+      break;
+
+    case X86::MOV16rm:
+      break;
+
+    default:
+      // The extension can't be folded into other instructions.
+      return false;
+    }
+  }
+
+  // Now we can change all the 16-bit load instructions to extending loads.
+  const X86InstrInfo *TII = MF->getSubtarget<X86Subtarget>().getInstrInfo();
+  const TargetLowering *TLI =
+      MF->getSubtarget<X86Subtarget>().getTargetLowering();
+  const TargetRegisterClass *NewClass = TLI->getRegClassFor(MVT::i32);
+  for (auto RegNo : AllRegs) {
+    // Change the register class to 32-bit.
+    MRI->setRegClass(RegNo, NewClass);
+
+    // If the def instruction is a 16-bit load, change it to an extending
+    // load. If the def is a PHI, do nothing; changing the register class
+    // is enough.
+    auto *DefMI = MRI->getVRegDef(RegNo);
+    if (DefMI->getOpcode() == X86::MOV16rm) {
+      unsigned NewOpcode = (ExtMI->getOpcode() == X86::MOVZX32rr16)
+                               ? X86::MOVZX32rm16
+                               : X86::MOVSX32rm16;
+      MachineInstrBuilder MIB =
+          BuildMI(*MF, DefMI->getDebugLoc(), TII->get(NewOpcode), RegNo);
+
+      unsigned NumArgs = DefMI->getNumOperands();
+      for (unsigned i = 1; i < NumArgs; ++i)
+        MIB.add(DefMI->getOperand(i));
+      MIB->setMemRefs(DefMI->memoperands_begin(), DefMI->memoperands_end());
+
+      MachineBasicBlock *MBB = DefMI->getParent();
+      MBB->insert(DefMI, MIB);
+      MBB->erase(DefMI);
+    }
+
+    // Change register uses to the 16-bit subregister.
+    for (auto &MO : MRI->use_operands(RegNo)) {
+      auto *MI = MO.getParent();
+      // Do nothing for PHI instructions.
+      if (MI->getOpcode() == X86::PHI &&
+          AllRegs.count(MI->getOperand(0).getReg()))
+        continue;
+
+      // Change other uses to the subregister.
+      MO.setSubReg(X86::sub_16bit);
+    }
+  }
+
+  // Now we can safely replace the users of ExtMI with its source register.
+  SmallVector<MachineOperand *, 4> MOUsers;
+  unsigned OldReg = ExtMI->getOperand(0).getReg();
+  unsigned NewReg = ExtMI->getOperand(1).getReg();
+  for (auto &ExtMO : MRI->use_operands(OldReg))
+    MOUsers.push_back(&ExtMO);
+  while (!MOUsers.empty()) {
+    MachineOperand *MO = MOUsers.pop_back_val();
+    MO->setReg(NewReg);
+  }
+  // We don't know which live range ends first, so clear the kill flags of
+  // both registers.
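+  // Clearing kill flags is conservatively safe: it only makes live ranges
+  // appear longer, never shorter.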
+  MRI->clearKillFlags(OldReg);
+  MRI->clearKillFlags(NewReg);
+
+  return true;
+}
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -86,6 +86,7 @@
   initializeX86DomainReassignmentPass(PR);
   initializeX86AvoidSFBPassPass(PR);
   initializeX86FlagsCopyLoweringPassPass(PR);
+  initializeFoldXBBExtLoadPassPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -465,6 +466,7 @@
 void X86PassConfig::addMachineSSAOptimization() {
   addPass(createX86DomainReassignmentPass());
   TargetPassConfig::addMachineSSAOptimization();
+  addPass(createX86FoldXBBExtLoad());
 }
 
 void X86PassConfig::addPostRegAlloc() {
Index: test/CodeGen/X86/O3-pipeline.ll
===================================================================
--- test/CodeGen/X86/O3-pipeline.ll
+++ test/CodeGen/X86/O3-pipeline.ll
@@ -85,6 +85,7 @@
 ; CHECK-NEXT: Machine code sinking
 ; CHECK-NEXT: Peephole Optimizations
 ; CHECK-NEXT: Remove dead machine instructions
+; CHECK-NEXT: X86 Cross BB ZExt/SExt Load Folding
 ; CHECK-NEXT: Live Range Shrink
 ; CHECK-NEXT: X86 Fixup SetCC
 ; CHECK-NEXT: X86 LEA Optimize
Index: test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
@@ -12,8 +12,7 @@
 ; X32-LABEL: test__tzcnt_u16:
 ; X32: # %bb.0:
 ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movzwl %ax, %ecx
-; X32-NEXT: cmpl $0, %ecx
+; X32-NEXT: cmpl $0, %eax
 ; X32-NEXT: jne .LBB0_1
 ; X32-NEXT: # %bb.2:
 ; X32-NEXT: movw $16, %ax
@@ -164,8 +163,7 @@
 ; X32-LABEL: test_tzcnt_u16:
 ; X32: # %bb.0:
 ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movzwl %ax, %ecx
-; X32-NEXT: cmpl $0, %ecx
+; X32-NEXT: cmpl $0, %eax
 ; X32-NEXT: jne .LBB7_1
 ; X32-NEXT: # %bb.2:
 ; X32-NEXT: movw $16, %ax
Index: test/CodeGen/X86/fold-xbb-ext-load.ll
===================================================================
--- test/CodeGen/X86/fold-xbb-ext-load.ll
+++ test/CodeGen/X86/fold-xbb-ext-load.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -mtriple=i686-unknown-unknown -tail-dup-placement=false | FileCheck %s
+
+declare void @v1()
+declare void @v2()
+declare void @v3()
+@a = external global [10 x i16]
+
+define void @foo(i32 %cond) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .cfi_offset %esi, -8
+; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movzwl a, %esi
+; CHECK-NEXT: calll v1
+; CHECK-NEXT: jmp .LBB0_3
+; CHECK-NEXT: .LBB0_2: # %if.else
+; CHECK-NEXT: movzwl a+2, %esi
+; CHECK-NEXT: calll v2
+; CHECK-NEXT: .LBB0_3: # %if.end
+; CHECK-NEXT: cmpl $4, %esi
+; CHECK-NEXT: jb .LBB0_5
+; CHECK-NEXT: # %bb.4: # %if.then1
+; CHECK-NEXT: calll v3
+; CHECK-NEXT: .LBB0_5: # %if.end2
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
+; CHECK-NEXT: retl
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  %0 = load i16, i16* getelementptr ([10 x i16], [10 x i16]* @a, i64 0, i64 0)
+  call void @v1()
+  br label %if.end
+
+if.else:
+  %1 = load i16, i16* getelementptr ([10 x i16], [10 x i16]* @a, i64 0, i64 1)
+  call void @v2()
+  br label %if.end
+
+if.end:
+  %2 = phi i16 [ %0, %if.then ], [ %1, %if.else ]
+  %cmp = icmp ugt i16 %2, 3
+  br i1 %cmp, label %if.then1, label %if.end2
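+
+; Without the fold, the loads above would stay 16-bit (movw) and the compare
+; would need an extra in-register extension, roughly:
+;   movzwl %si, %eax
+;   cmpl $4, %eax
+; With the fold, both loads become movzwl and cmpl reads %esi directly, as
+; the CHECK lines above verify.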
+
+if.then1:
+  call void @v3()
+  br label %if.end2
+
+if.end2:
+  ret void
+}
Index: test/CodeGen/X86/jump_sign.ll
===================================================================
--- test/CodeGen/X86/jump_sign.ll
+++ test/CodeGen/X86/jump_sign.ll
@@ -236,7 +236,6 @@
 ; CHECK-NEXT: jne .LBB12_8
 ; CHECK-NEXT: # %bb.4: # %if.end29
 ; CHECK-NEXT: movzwl (%eax), %eax
-; CHECK-NEXT: movzwl %ax, %eax
 ; CHECK-NEXT: imull $52429, %eax, %ecx # imm = 0xCCCD
 ; CHECK-NEXT: shrl $19, %ecx
 ; CHECK-NEXT: addl %ecx, %ecx