Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -508,6 +508,10 @@
     return false;
   }
 
+  /// Return true if the target prefers arithmetic instructions instead of
+  /// a load from memory.
+  virtual bool isSubShrCheaperThanLoad() const { return false; }
+
   /// Return true if target supports floating point exceptions.
   bool hasFloatingPointExceptions() const {
     return HasFloatingPointExceptions;
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -6039,6 +6039,67 @@
   return true;
 }
 
+// When loading an integer value from an array of constants that is defined
+// as follows:
+//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF, ..., 2^(SIZE-1) - 1}
+// the load of array[idx] can be replaced with (0xFFFFFFFF >> (32 - idx)).
+static bool ReplaceLoadWithSubShr(const TargetLowering *TLI, LoadInst *LI) {
+  if (!TLI || !TLI->isSubShrCheaperThanLoad())
+    return false;
+
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
+      if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+
+        Constant *Init = GV->getInitializer();
+        Type *Ty = Init->getType();
+        if ((!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init)) ||
+            !Ty->getArrayElementType()->isIntegerTy() ||
+            Ty->getArrayNumElements() >
+                Ty->getArrayElementType()->getScalarSizeInBits())
+          return false;
+
+        // Check if the array's constant elements are suitable for our case.
+        uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+        for (uint64_t i = 0; i < ArrayElementCount; i++) {
+          ConstantInt *Elem =
+              dyn_cast<ConstantInt>(Init->getAggregateElement(i));
+          if (!Elem || Elem->getZExtValue() != (((uint64_t)1 << i) - 1))
+            return false;
+        }
+
+        IRBuilder<> Builder(LI->getContext());
+        Builder.SetInsertPoint(LI);
+
+        // Do the transformation (assuming 32-bit elements):
+        //   -> elemPtr = getelementptr @array, 0, idx
+        //      elem = load elemPtr
+        //   <- hiBit = 32 - idx
+        //      elem = 0xFFFFFFFF >> hiBit
+        Type *ElemTy = Ty->getArrayElementType();
+        Value *LoadIdx = Builder.CreateZExtOrTrunc(GEP->getOperand(2), ElemTy);
+
+        APInt ElemSize =
+            APInt(ElemTy->getScalarSizeInBits(), ElemTy->getScalarSizeInBits());
+        Constant *ElemSizeConst = Constant::getIntegerValue(ElemTy, ElemSize);
+        Value *Sub = Builder.CreateSub(ElemSizeConst, LoadIdx);
+
+        Constant *AllOnes = Constant::getAllOnesValue(ElemTy);
+        Value *LShr = Builder.CreateLShr(AllOnes, Sub);
+
+        LI->replaceAllUsesWith(LShr);
+        RecursivelyDeleteTriviallyDeadInstructions(LI);
+
+        // Remove the array if possible.
+        if (GV->use_empty() && GV->isDiscardableIfUnused())
+          GV->eraseFromParent();
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -6093,6 +6154,11 @@
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
+
+    // Try to get rid of the load if possible (replace it with sub & shr).
+    if (ReplaceLoadWithSubShr(TLI, LI))
+      return true;
+
     if (TLI) {
       bool Modified = optimizeLoadExt(LI);
       unsigned AS = LI->getPointerAddressSpace();
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -884,6 +884,10 @@
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const override;
 
+  /// Return true if the target prefers arithmetic instructions instead of
+  /// a load from memory.
+  bool isSubShrCheaperThanLoad() const override;
+
   /// Return true if the addressing mode represented
   /// by AM is legal for this target, for a load/store of the specified type.
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -24511,6 +24511,10 @@
   return nullptr;
 }
 
+/// Return true if the target prefers arithmetic instructions instead of
+/// a load from memory.
+bool X86TargetLowering::isSubShrCheaperThanLoad() const { return true; }
+
 /// Return true if the addressing mode represented by AM is legal for this
 /// target, for a load/store of the specified type.
 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
Index: test/CodeGen/X86/replace-load-with-sub-shr.ll
===================================================================
--- test/CodeGen/X86/replace-load-with-sub-shr.ll
+++ test/CodeGen/X86/replace-load-with-sub-shr.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi2 | FileCheck %s
+
+@fill_table32 = internal unnamed_addr constant [32 x i32] [i32 0, i32 1, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 1023, i32 2047, i32 4095, i32 8191, i32 16383, i32 32767, i32 65535, i32 131071, i32 262143, i32 524287, i32 1048575, i32 2097151, i32 4194303, i32 8388607, i32 16777215, i32 33554431, i32 67108863, i32 134217727, i32 268435455, i32 536870911, i32 1073741823, i32 2147483647], align 16
+@fill_table64 = internal unnamed_addr constant [64 x i64] [i64 0, i64 1, i64 3, i64 7, i64 15, i64 31, i64 63, i64 127, i64 255, i64 511, i64 1023, i64 2047, i64 4095, i64 8191, i64 16383, i64 32767, i64 65535, i64 131071, i64 262143, i64 524287, i64 1048575, i64 2097151, i64 4194303, i64 8388607, i64 16777215, i64 33554431, i64 67108863, i64 134217727, i64 268435455, i64 536870911, i64 1073741823, i64 2147483647, i64 4294967295, i64 8589934591, i64 17179869183, i64 34359738367, i64 68719476735, i64 137438953471, i64 274877906943, i64 549755813887, i64 1099511627775, i64 2199023255551, i64 4398046511103, i64 8796093022207, i64 17592186044415, i64 35184372088831, i64 70368744177663, i64 140737488355327, i64 281474976710655, i64 562949953421311, i64 1125899906842623, i64 2251799813685247, i64 4503599627370495, i64 9007199254740991, i64 18014398509481983, i64 36028797018963967, i64 72057594037927935, i64 144115188075855871, i64 288230376151711743, i64 576460752303423487, i64 1152921504606846975, i64 2305843009213693951, i64 4611686018427387903, i64 9223372036854775807], align 16
+
+define i32 @f32(i32 %y) local_unnamed_addr {
+; CHECK-LABEL: f32:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl $32, %eax
+; CHECK-NEXT:    subl %edi, %eax
+; CHECK-NEXT:    movl $-1, %ecx
+; CHECK-NEXT:    shrxl %eax, %ecx, %eax
+; CHECK-NEXT:    retq
+entry:
+  %idxprom = sext i32 %y to i64
+  %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+define i64 @f64(i64 %y) local_unnamed_addr {
+; CHECK-LABEL: f64:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl $64, %eax
+; CHECK-NEXT:    subl %edi, %eax
+; CHECK-NEXT:    movq $-1, %rcx
+; CHECK-NEXT:    shrxq %rax, %rcx, %rax
+; CHECK-NEXT:    retq
+entry:
+  %arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y
+  %0 = load i64, i64* %arrayidx, align 8
+  ret i64 %0
+}
+
+define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
+; CHECK-LABEL: f32_bzhi:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    bzhil %esi, %edi, %eax
+; CHECK-NEXT:    retq
+entry:
+  %idxprom = sext i32 %y to i64
+  %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %and = and i32 %0, %x
+  ret i32 %and
+}
+
+define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
+; CHECK-LABEL: f64_bzhi:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT:    retq
+entry:
+  %arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y
+  %0 = load i64, i64* %arrayidx, align 8
+  %and = and i64 %0, %x
+  ret i64 %and
+}
+
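
Note (not part of the patch): a minimal C++ sketch of the source-level idiom that ReplaceLoadWithSubShr targets. The fill_table32 name and contents mirror the table in the test above; the keep_low_bits_* helpers, init_fill_table, and the main() driver are hypothetical and exist only to illustrate the before/after shapes.

#include <cstdint>
#include <cstdio>

// Entry i holds a mask with the low i bits set, i.e. (1u << i) - 1,
// matching @fill_table32 in the test above.
static uint32_t fill_table32[32];

static void init_fill_table() {
  for (unsigned i = 0; i < 32; ++i)
    fill_table32[i] = (i == 0) ? 0u : (UINT32_MAX >> (32 - i));
}

// Before the patch: keeping the low n bits of x requires a load from the
// constant table.
static uint32_t keep_low_bits_load(uint32_t x, unsigned n) {
  return x & fill_table32[n];
}

// After ReplaceLoadWithSubShr the load becomes sub + lshr of an all-ones
// value; combined with the 'and', BMI2 targets can select a single BZHI,
// as the f32_bzhi/f64_bzhi CHECK lines expect. The n == 0 guard here only
// avoids the C++ undefined behaviour of shifting a uint32_t by 32.
static uint32_t keep_low_bits_arith(uint32_t x, unsigned n) {
  return (n == 0) ? 0u : (x & (UINT32_MAX >> (32 - n)));
}

int main() {
  init_fill_table();
  printf("%u %u\n", keep_low_bits_load(0xABCDu, 8),
         keep_low_bits_arith(0xABCDu, 8));
  return 0;
}

Both helpers compute the same value for 0 <= n < 32; the only difference is whether the mask comes from memory or from arithmetic, which is the trade-off the isSubShrCheaperThanLoad hook exposes.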