Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -32159,6 +32159,124 @@ return DAG.getBitcast(N->getValueType(0), Shift); } +// Get the index node from the lowered DAG of a GEP IR instruction with one +// indexing dimension. +static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) { + if (Ld->isIndexed()) + return SDValue(); + + SDValue Base = Ld->getBasePtr(); + + if (Base.getOpcode() != ISD::ADD) + return SDValue(); + + SDValue ShiftedIndex = Base.getOperand(0); + + if (ShiftedIndex.getOpcode() != ISD::SHL) + return SDValue(); + + return ShiftedIndex.getOperand(0); + +} + +static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) { + if (Subtarget.hasBMI2() && VT.isScalarInteger()) { + switch (VT.getSizeInBits()) { + default: return false; + case 64: return Subtarget.is64Bit() ? true : false; + case 32: return true; + } + } + return true; +} + +// This function recognizes cases where X86 bzhi instruction can replace and +// 'and-load' sequence. +// In case of loading integer value from an array of constants which is defined +// as follows: +// +// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1} +// +// then applying a bitwise and on the result with another input. +// It's equivalent to performing bzhi (zero high bits) on the input, with the +// same index of the load. +static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + MVT VT = Node->getSimpleValueType(0); + SDLoc dl(Node); + + // Check if subtarget has BZHI instruction for the node's type + if (!hasBZHI(Subtarget, VT)) + return SDValue(); + + // Try matching the pattern for both operands. + for (unsigned i = 0; i < 2; i++) { + SDValue N = Node->getOperand(i); + LoadSDNode *Ld = dyn_cast(N.getNode()); + + // continue if the operand is not a load instruction + if (!Ld) + return SDValue(); + + const Value *MemOp = Ld->getMemOperand()->getValue(); + + if (!MemOp) + return SDValue(); + + if (const GetElementPtrInst *GEP = dyn_cast(MemOp)) { + if (GlobalVariable *GV = dyn_cast(GEP->getOperand(0))) { + if (GV->isConstant() && GV->hasDefinitiveInitializer()) { + + Constant *Init = GV->getInitializer(); + Type *Ty = Init->getType(); + if (!isa(Init) || + !Ty->getArrayElementType()->isIntegerTy() || + Ty->getArrayElementType()->getScalarSizeInBits() != + VT.getSizeInBits() || + Ty->getArrayNumElements() > + Ty->getArrayElementType()->getScalarSizeInBits()) + continue; + + // Check if the array's constant elements are suitable to our case. + uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); + bool ConstantsMatch = true; + for (uint64_t j = 0; j < ArrayElementCount; j++) { + ConstantInt *Elem = + dyn_cast(Init->getAggregateElement(j)); + if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) { + ConstantsMatch = false; + break; + } + } + if (!ConstantsMatch) + continue; + + // Do the transformation (For 32-bit type): + // -> (and (load arr[idx]), inp) + // <- (and (srl 0xFFFFFFFF, (sub 32, idx))) + // that will be replaced with one bzhi instruction. + SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0); + SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT); + + // Get the Node which indexes into the array. + SDValue Index = getIndexFromUnindexedLoad(Ld); + if (!Index) + return SDValue(); + Index = DAG.getZExtOrTrunc(Index, dl, VT); + + SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index); + + SDValue AllOnes = DAG.getAllOnesConstant(dl, VT); + SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub); + + return DAG.getNode(ISD::AND, dl, VT, Inp, LShr); + } + } + } + } + return SDValue(); +} + static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -32177,6 +32295,9 @@ if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) return ShiftRight; + if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) + return R; + EVT VT = N->getValueType(0); // Attempt to recursively combine a bitmask AND with shuffles. Index: test/CodeGen/X86/replace-load-and-with-bzhi.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/replace-load-and-with-bzhi.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi2 | FileCheck %s +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi2 | FileCheck %s -check-prefix=CHECK32 + +@fill_table32 = internal unnamed_addr constant [32 x i32] [i32 0, i32 1, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 1023, i32 2047, i32 4095, i32 8191, i32 16383, i32 32767, i32 65535, i32 131071, i32 262143, i32 524287, i32 1048575, i32 2097151, i32 4194303, i32 8388607, i32 16777215, i32 33554431, i32 67108863, i32 134217727, i32 268435455, i32 536870911, i32 1073741823, i32 2147483647], align 16 +@fill_table32_partial = internal unnamed_addr constant [17 x i32] [i32 0, i32 1, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 1023, i32 2047, i32 4095, i32 8191, i32 16383, i32 32767, i32 65535], align 16 +@fill_table64 = internal unnamed_addr constant [64 x i64] [i64 0, i64 1, i64 3, i64 7, i64 15, i64 31, i64 63, i64 127, i64 255, i64 511, i64 1023, i64 2047, i64 4095, i64 8191, i64 16383, i64 32767, i64 65535, i64 131071, i64 262143, i64 524287, i64 1048575, i64 2097151, i64 4194303, i64 8388607, i64 16777215, i64 33554431, i64 67108863, i64 134217727, i64 268435455, i64 536870911, i64 1073741823, i64 2147483647, i64 4294967295, i64 8589934591, i64 17179869183, i64 34359738367, i64 68719476735, i64 137438953471, i64 274877906943, i64 549755813887, i64 1099511627775, i64 2199023255551, i64 4398046511103, i64 8796093022207, i64 17592186044415, i64 35184372088831, i64 70368744177663, i64 140737488355327, i64 281474976710655, i64 562949953421311, i64 1125899906842623, i64 2251799813685247, i64 4503599627370495, i64 9007199254740991, i64 18014398509481983, i64 36028797018963967, i64 72057594037927935, i64 144115188075855871, i64 288230376151711743, i64 576460752303423487, i64 1152921504606846975, i64 2305843009213693951, i64 4611686018427387903, i64 9223372036854775807], align 16 +@fill_table64_partial = internal unnamed_addr constant [51 x i64] [i64 0, i64 1, i64 3, i64 7, i64 15, i64 31, i64 63, i64 127, i64 255, i64 511, i64 1023, i64 2047, i64 4095, i64 8191, i64 16383, i64 32767, i64 65535, i64 131071, i64 262143, i64 524287, i64 1048575, i64 2097151, i64 4194303, i64 8388607, i64 16777215, i64 33554431, i64 67108863, i64 134217727, i64 268435455, i64 536870911, i64 1073741823, i64 2147483647, i64 4294967295, i64 8589934591, i64 17179869183, i64 34359738367, i64 68719476735, i64 137438953471, i64 274877906943, i64 549755813887, i64 1099511627775, i64 2199023255551, i64 4398046511103, i64 8796093022207, i64 17592186044415, i64 35184372088831, i64 70368744177663, i64 140737488355327, i64 281474976710655, i64 562949953421311, i64 1125899906842623], align 16 + +define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr { +; CHECK-LABEL: f32_bzhi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhil %esi, %edi, %eax +; CHECK-NEXT: retq +; +; CHECK32-LABEL: f32_bzhi: +; CHECK32: # BB#0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: retl +entry: + %idxprom = sext i32 %y to i64 + %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %and = and i32 %0, %x + ret i32 %and +} + +define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr { +; CHECK-LABEL: f32_bzhi_partial: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhil %esi, %edi, %eax +; CHECK-NEXT: retq +; +; CHECK32-LABEL: f32_bzhi_partial: +; CHECK32: # BB#0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: retl +entry: + %idxprom = sext i32 %y to i64 + %arrayidx = getelementptr inbounds [17 x i32], [17 x i32]* @fill_table32_partial, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %and = and i32 %0, %x + ret i32 %and +} + +define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr { +; CHECK-LABEL: f64_bzhi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +; +; CHECK32-LABEL: f64_bzhi: +; CHECK32: # BB#0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl fill_table64+4(,%eax,8), %edx +; CHECK32-NEXT: movl fill_table64(,%eax,8), %eax +; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx +; CHECK32-NEXT: retl +entry: + %arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y + %0 = load i64, i64* %arrayidx, align 8 + %and = and i64 %0, %x + ret i64 %and +} + +define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr { +; CHECK-LABEL: f64_bzhi_partial: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +; +; CHECK32-LABEL: f64_bzhi_partial: +; CHECK32: # BB#0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl fill_table64_partial+4(,%eax,8), %edx +; CHECK32-NEXT: movl fill_table64_partial(,%eax,8), %eax +; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx +; CHECK32-NEXT: retl +entry: + %arrayidx = getelementptr inbounds [51 x i64], [51 x i64]* @fill_table64_partial, i64 0, i64 %y + %0 = load i64, i64* %arrayidx, align 8 + %and = and i64 %0, %x + ret i64 %and +} +