Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2806,6 +2806,12 @@
     return false;
   }
 
+  /// Return true if it is desirable to speculatively load the operands
+  /// of a select instruction for the target.
+  virtual bool isDesirableToSpeculateSelectLoad() const {
+    return false;
+  }
+
   /// Return true if the target supports swifterror attribute. It optimizes
   /// loads and stores to reading and writing a specific register.
   virtual bool supportSwiftError() const {
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16248,6 +16248,26 @@
                                          LLD->getBasePtr().getValueType()))
       return false;
 
+    // Avoid combining the loads if both are GEPs into elements of the same
+    // struct. TODO: handle cases where the GEP is bitcast to another type.
+    if (TLI.isDesirableToSpeculateSelectLoad()) {
+      if (LLD->getMemOperand()->getValue() &&
+          RLD->getMemOperand()->getValue()) {
+        const GetElementPtrInst *GEPTrue =
+            dyn_cast<GetElementPtrInst>(LLD->getMemOperand()->getValue());
+        const GetElementPtrInst *GEPFalse =
+            dyn_cast<GetElementPtrInst>(RLD->getMemOperand()->getValue());
+        if (GEPTrue && GEPFalse) {
+          if (GEPTrue->getSourceElementType()->isStructTy() &&
+              GEPFalse->getSourceElementType()->isStructTy() &&
+              GEPTrue->getPointerOperand() == GEPFalse->getPointerOperand() &&
+              GEPTrue->hasAllConstantIndices() &&
+              GEPFalse->hasAllConstantIndices())
+            return false;
+        }
+      }
+    }
+
     // Check that the select condition doesn't reach either load. If so,
     // folding this will induce a cycle into the DAG. If not, this is safe to
     // xform, so create a select of the addresses.
Index: lib/Target/X86/CMakeLists.txt
===================================================================
--- lib/Target/X86/CMakeLists.txt
+++ lib/Target/X86/CMakeLists.txt
@@ -45,6 +45,7 @@
   X86RegisterInfo.cpp
   X86SelectionDAGInfo.cpp
   X86ShuffleDecodeConstantPool.cpp
+  X86SpeculateSelectLoad.cpp
   X86Subtarget.cpp
   X86TargetMachine.cpp
   X86TargetObjectFile.cpp
Index: lib/Target/X86/X86.h
===================================================================
--- lib/Target/X86/X86.h
+++ lib/Target/X86/X86.h
@@ -77,6 +77,10 @@
 /// Windows-specific but architecture-neutral preparation.
 FunctionPass *createX86WinEHStatePass();
 
+/// Return an IR pass that tries to speculatively load the operands of
+/// a select instruction when profitable.
+FunctionPass *createX86SpeculateSelectLoadPass();
+
 /// Return a Machine IR pass that expands X86-specific pseudo
 /// instructions into a sequence of actual instructions. This pass
 /// must run after prologue/epilogue insertion and before lowering
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -805,6 +805,10 @@
     /// and some i16 instructions are slow.
     bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
 
+    /// Return true if it is desirable to speculatively load the operands
+    /// of a select instruction for the target.
+    bool isDesirableToSpeculateSelectLoad() const override { return true; }
+
     MachineBasicBlock *
     EmitInstrWithCustomInserter(MachineInstr &MI,
                                 MachineBasicBlock *MBB) const override;
Index: lib/Target/X86/X86SpeculateSelectLoad.cpp
===================================================================
--- /dev/null
+++ lib/Target/X86/X86SpeculateSelectLoad.cpp
@@ -0,0 +1,148 @@
+//===-- X86SpeculateSelectLoad.cpp - Speculatively load operands of a select -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// For a select instruction whose operands are the address calculations of two
+// independent loads, this pass tries to speculate the loads and feed their
+// results into the select instruction. This allows the loads to execute early
+// and in parallel, and possibly to be folded as memory operands into the CMOV
+// instructions later on. The pass currently only handles cases where the loads
+// are elements of the same struct.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86speculateload"
+
+namespace llvm {
+void initializeX86SpeculateSelectLoadPassPass(PassRegistry &);
+}
+
+namespace {
+
+class X86SpeculateSelectLoadPass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid.
+
+  X86SpeculateSelectLoadPass() : FunctionPass(ID) {
+    initializeX86SpeculateSelectLoadPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &Fn) override;
+
+  StringRef getPassName() const override {
+    return "X86 Speculatively load before select instruction";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+private:
+  bool OptimizeSelectInst(SelectInst *SI);
+
+  const DataLayout *DL;
+  const TargetTransformInfo *TTI;
+  // Instructions replaced by the transformation; erased in runOnFunction.
+  SmallVector<Instruction *, 4> InstrForRemoval;
+};
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86SpeculateSelectLoadPass() {
+  return new X86SpeculateSelectLoadPass();
+}
+
+char X86SpeculateSelectLoadPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86SpeculateSelectLoadPass, "x86-speculateload",
+                      "X86 Speculatively load before select instruction",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(X86SpeculateSelectLoadPass, "x86-speculateload",
+                    "X86 Speculatively load before select instruction",
+                    false, false)
+
+bool X86SpeculateSelectLoadPass::OptimizeSelectInst(SelectInst *SI) {
+  GetElementPtrInst *GEPIT = dyn_cast<GetElementPtrInst>(SI->getTrueValue());
+  GetElementPtrInst *GEPIF = dyn_cast<GetElementPtrInst>(SI->getFalseValue());
+  if (GEPIT == nullptr || GEPIF == nullptr)
+    return false;
+
+  // The pass currently only handles cases where both loads access scalar
+  // (non-aggregate) elements of the same struct.
+  if (!GEPIT->getSourceElementType()->isStructTy() ||
+      !GEPIF->getSourceElementType()->isStructTy() ||
+      GEPIT->getPointerOperand() != GEPIF->getPointerOperand() ||
+      !GEPIT->hasAllConstantIndices() || !GEPIF->hasAllConstantIndices() ||
+      GEPIT->getNumOperands() != 3 || GEPIF->getNumOperands() != 3)
+    return false;
+
+  if (!SI->hasOneUse())
+    return false;
+
+  // Bail out if there is a good chance we'll be loading from two different
+  // cache lines instead of one.
+  if (StructType *STy = dyn_cast<StructType>(GEPIT->getSourceElementType())) {
+    // Get the indices of the two elements in the struct.
+    ConstantInt *Idx1 = dyn_cast<ConstantInt>(GEPIT->getOperand(2));
+    ConstantInt *Idx2 = dyn_cast<ConstantInt>(GEPIF->getOperand(2));
+    const StructLayout *STL = DL->getStructLayout(STy);
+    if (Idx1 && Idx2 && STL) {
+      uint64_t Offset1 = STL->getElementOffset(Idx1->getZExtValue());
+      uint64_t Offset2 = STL->getElementOffset(Idx2->getZExtValue());
+      uint64_t Dist = Offset1 > Offset2 ? Offset1 - Offset2 : Offset2 - Offset1;
+      assert(TTI->getCacheLineSize() > 0 &&
+             "CacheLineSize information is missing for X86");
+      if (Dist > TTI->getCacheLineSize())
+        return false;
+    }
+  }
+
+  for (User *U : SI->users()) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      if (!LI->isSimple())
+        return false;
+      IRBuilder<> Builder(SI);
+      LoadInst *LT = Builder.CreateAlignedLoad(GEPIT, LI->getAlignment());
+      LoadInst *LF = Builder.CreateAlignedLoad(GEPIF, LI->getAlignment());
+      Value *NewSI = Builder.CreateSelect(SI->getCondition(), LT, LF);
+      LI->replaceAllUsesWith(NewSI);
+      InstrForRemoval.push_back(LI);
+      InstrForRemoval.push_back(SI);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool X86SpeculateSelectLoadPass::runOnFunction(Function &F) {
+  if (skipFunction(F) || F.optForSize())
+    return false;
+
+  DL = &F.getParent()->getDataLayout();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  bool Changed = false;
+
+  for (auto &BB : F)
+    for (auto &I : BB)
+      if (SelectInst *SI = dyn_cast<SelectInst>(&I))
+        Changed |= OptimizeSelectInst(SI);
+
+  for (auto *Instr : InstrForRemoval)
+    Instr->eraseFromParent();
+
+  InstrForRemoval.clear();
+
+  return Changed;
+}
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -59,6 +59,7 @@
 void initializeWinEHStatePassPass(PassRegistry &);
 void initializeFixupLEAPassPass(PassRegistry &);
 void initializeX86ExecutionDepsFixPass(PassRegistry &);
+void initializeX86SpeculateSelectLoadPassPass(PassRegistry &);
 
 } // end namespace llvm
 
@@ -74,6 +75,7 @@
   initializeEvexToVexInstPassPass(PR);
   initializeFixupLEAPassPass(PR);
   initializeX86ExecutionDepsFixPass(PR);
+  initializeX86SpeculateSelectLoadPassPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -342,8 +344,10 @@
   TargetPassConfig::addIRPasses();
 
-  if (TM->getOptLevel() != CodeGenOpt::None)
+  if (TM->getOptLevel() != CodeGenOpt::None) {
     addPass(createInterleavedAccessPass());
+    addPass(createX86SpeculateSelectLoadPass());
+  }
 }
 
 bool X86PassConfig::addInstSelector() {
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -115,6 +115,8 @@
                          const Function *Callee) const;
   bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
   bool enableInterleavedAccessVectorization();
+  unsigned getCacheLineSize() const;
+
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
                       unsigned Alignment, unsigned AddressSpace);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2529,3 +2529,5 @@
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
 }
+
+unsigned X86TTIImpl::getCacheLineSize() const { return 64; }
Index: test/CodeGen/X86/speculate-select-load.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/speculate-select-load.ll
@@ -0,0 +1,70 @@
+; RUN: opt < %s -mcpu=x86-64 -S -x86-speculateload | FileCheck %s
+; RUN: opt < %s -mcpu=i386 -S -x86-speculateload | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.S = type { i64, %struct.S*, %struct.S*, i64, i16, i64, i64, i64, i64, i64 }
+
+;; Selecting between pointers to two members of the same struct whose offsets
+;; differ by less than a cache line (64 bytes): the loads are speculated.
+; Function Attrs: norecurse nounwind readonly uwtable
+define %struct.S* @spec_load(i32 %x, %struct.S* nocapture readnone %A, %struct.S* nocapture readonly %B) local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: @spec_load
+; CHECK: getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 1
+; CHECK: getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 2
+; CHECK: [[A:%[0-9]+]] = load %struct.S*, %struct.S** %{{.*}}, align 8
+; CHECK: [[B:%[0-9]+]] = load %struct.S*, %struct.S** %{{.*}}, align 8
+; CHECK: select i1 %tobool, %struct.S* [[A]], %struct.S* [[B]]
+
+  %tobool = icmp eq i32 %x, 0
+  %b = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 1
+  %c = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 2
+  %A.addr.0.in = select i1 %tobool, %struct.S** %c, %struct.S** %b
+  %A.addr.0 = load %struct.S*, %struct.S** %A.addr.0.in, align 8
+  ret %struct.S* %A.addr.0
+}
+
+;; Selecting between pointers to two members of the same struct whose offsets
+;; differ by more than a cache line (64 bytes): do not load speculatively.
+; Function Attrs: norecurse nounwind readonly uwtable
+define i64 @no_spec_load(i32 %x, i64 %A, %struct.S* nocapture readonly %B) local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: @no_spec_load
+; CHECK: [[A:%[a-z]+]] = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 0
+; CHECK: [[B:%[a-z]+]] = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 9
+; CHECK: select i1 %tobool, i64* [[B]], i64* [[A]]
+; CHECK-NEXT: load i64, i64* %{{.*}}, align 8
+
+  %tobool = icmp eq i32 %x, 0
+  %a = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 0
+  %j = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 9
+  %A.addr.0.in = select i1 %tobool, i64* %j, i64* %a
+  %A.addr.0 = load i64, i64* %A.addr.0.in, align 8
+  ret i64 %A.addr.0
+}
+
+;; Selecting into an aggregate member of the struct; do not load speculatively.
+%struct.S2 = type { i32, [10 x i32] }
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @no_load_agg(%struct.S2* nocapture readonly %s1, %struct.S2* nocapture readnone %s2, i32 %x) local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: @no_load_agg
+; CHECK: [[A:%[a-z]+]] = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 0
+; CHECK: [[B:%[a-z]+]] = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 1, i64 10
+; CHECK: %retval.0.in = select i1 %tobool, i32* [[B]], i32* [[A]]
+
+  %tobool = icmp eq i32 %x, 0
+  %a = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 0
+  %arrayidx = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 1, i64 10
+  %retval.0.in = select i1 %tobool, i32* %arrayidx, i32* %a
+  %retval.0 = load i32, i32* %retval.0.in, align 4
+  ret i32 %retval.0
+}
+
+attributes #0 = { norecurse nounwind readonly uwtable "target-cpu"="core-avx2" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
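
For reference, the source-level pattern the pass targets typically looks like the sketch below (illustrative only, not part of the patch; Node and pick are made-up names). Mid-level optimizations usually turn such a conditional member access into a select of the two field addresses followed by a single load, as in the tests above; after this pass, both fields are loaded up front and the select chooses between the loaded values, which the backend can then lower to a CMOV with one load folded as a memory operand.

// Hypothetical C++ example: both fields live in the same struct, within one
// 64-byte cache line, so speculatively loading both is cheap.
struct Node {
  long key;
  Node *left;  // offset 8
  Node *right; // offset 16
};

Node *pick(int x, Node *n) {
  // Before the pass: select of &n->right / &n->left, then one load.
  // After the pass: two loads, then a select of the loaded values.
  return x == 0 ? n->right : n->left;
}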