Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -32453,6 +32453,124 @@
   return DAG.getBitcast(N->getValueType(0), Shift);
 }
 
+// Get the index node from the lowered DAG of a GEP IR instruction with one
+// indexing dimension.
+static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
+  if (Ld->isIndexed())
+    return SDValue();
+
+  SDValue Base = Ld->getBasePtr();
+
+  if (Base.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue ShiftedIndex = Base.getOperand(0);
+
+  if (ShiftedIndex.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  return ShiftedIndex.getOperand(0);
+}
+
+static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
+  if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
+    switch (VT.getSizeInBits()) {
+    default: return false;
+    case 64: return Subtarget.is64Bit();
+    case 32: return true;
+    }
+  }
+  return false;
+}
+
+// This function recognizes cases where the X86 bzhi instruction can replace
+// an 'and-load' sequence: a load of an integer value from an array of
+// constants defined as
+//
+//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
+//
+// followed by a bitwise AND of the loaded value with another input. This is
+// equivalent to performing bzhi (zero high bits) on the input, using the
+// same index as the load.
+static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  MVT VT = Node->getSimpleValueType(0);
+  SDLoc dl(Node);
+
+  // Check if the subtarget has a BZHI instruction for the node's type.
+  if (!hasBZHI(Subtarget, VT))
+    return SDValue();
+
+  // Try matching the pattern for both operands.
+  for (unsigned i = 0; i < 2; i++) {
+    SDValue N = Node->getOperand(i);
+    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
+
+    // Continue if the operand is not a load instruction.
+    if (!Ld)
+      continue;
+
+    const Value *MemOp = Ld->getMemOperand()->getValue();
+
+    if (!MemOp)
+      continue;
+
+    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
+      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
+        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+
+          Constant *Init = GV->getInitializer();
+          Type *Ty = Init->getType();
+          if (!isa<ConstantDataArray>(Init) ||
+              !Ty->getArrayElementType()->isIntegerTy() ||
+              Ty->getArrayElementType()->getScalarSizeInBits() !=
+                  VT.getSizeInBits() ||
+              Ty->getArrayNumElements() >
+                  Ty->getArrayElementType()->getScalarSizeInBits())
+            continue;
+
+          // Check if the array's constant elements are suitable for our case.
+          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+          bool ConstantsMatch = true;
+          for (uint64_t j = 0; j < ArrayElementCount; j++) {
+            ConstantInt *Elem =
+                dyn_cast<ConstantInt>(Init->getAggregateElement(j));
+            if (!Elem || Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
+              ConstantsMatch = false;
+              break;
+            }
+          }
+          if (!ConstantsMatch)
+            continue;
+
+          // Do the transformation (for a 32-bit type):
+          // -> (and (load arr[idx]), inp)
+          // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
+          // which will then be selected as one bzhi instruction.
+          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
+          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
+
+          // Get the Node which indexes into the array.
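+          // (Matched shape, per getIndexFromUnindexedLoad above: the lowered
+          // one-dimensional GEP address is (add (shl index, scale), base),
+          // so the index is operand 0 of the SHL feeding the ADD.)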
+          SDValue Index = getIndexFromUnindexedLoad(Ld);
+          if (!Index)
+            return SDValue();
+          Index = DAG.getZExtOrTrunc(Index, dl, VT);
+
+          SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
+
+          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
+          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
+
+          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -32471,6 +32589,9 @@
   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
     return ShiftRight;
 
+  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
+    return R;
+
   EVT VT = N->getValueType(0);
 
   // Attempt to recursively combine a bitmask AND with shuffles.
Index: test/CodeGen/X86/replace-load-and-with-bzhi.ll
===================================================================
--- test/CodeGen/X86/replace-load-and-with-bzhi.ll
+++ test/CodeGen/X86/replace-load-and-with-bzhi.ll
@@ -10,17 +10,14 @@
 define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
 ; CHECK-LABEL: f32_bzhi:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movslq %esi, %rax
-; CHECK-NEXT:    andl fill_table32(,%rax,4), %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; CHECK-NEXT:    bzhil %esi, %edi, %eax
+; CHECK-NEXT:    retq
 ;
 ; CHECK32-LABEL: f32_bzhi:
 ; CHECK32:       # BB#0: # %entry
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movl fill_table32(,%eax,4), %eax
-; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    ret{{[l|q]}}
+; CHECK32-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    retl
 entry:
   %idxprom = sext i32 %y to i64
   %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom
@@ -32,17 +29,14 @@
 define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr {
 ; CHECK-LABEL: f32_bzhi_partial:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movslq %esi, %rax
-; CHECK-NEXT:    andl fill_table32_partial(,%rax,4), %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; CHECK-NEXT:    bzhil %esi, %edi, %eax
+; CHECK-NEXT:    retq
 ;
 ; CHECK32-LABEL: f32_bzhi_partial:
 ; CHECK32:       # BB#0: # %entry
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movl fill_table32_partial(,%eax,4), %eax
-; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    ret{{[l|q]}}
+; CHECK32-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    retl
 entry:
   %idxprom = sext i32 %y to i64
   %arrayidx = getelementptr inbounds [17 x i32], [17 x i32]* @fill_table32_partial, i64 0, i64 %idxprom
@@ -54,9 +48,8 @@
 define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
 ; CHECK-LABEL: f64_bzhi:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    andq fill_table64(,%rsi,8), %rdi
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    ret{{[l|q]}}
+; CHECK-NEXT:    bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT:    retq
 ;
 ; CHECK32-LABEL: f64_bzhi:
 ; CHECK32:       # BB#0: # %entry
@@ -65,7 +58,7 @@
 ; CHECK32-NEXT:    movl fill_table64(,%eax,8), %eax
 ; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; CHECK32-NEXT:    ret{{[l|q]}}
+; CHECK32-NEXT:    retl
 entry:
   %arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y
   %0 = load i64, i64* %arrayidx, align 8
@@ -76,9 +69,8 @@
 define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr {
 ; CHECK-LABEL: f64_bzhi_partial:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    andq fill_table64_partial(,%rsi,8), %rdi
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    ret{{[l|q]}}
+; CHECK-NEXT:    bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT:    retq
 ;
 ; CHECK32-LABEL: f64_bzhi_partial:
 ; CHECK32:       # BB#0: # %entry
@@ -87,7 +79,7 @@
 ; CHECK32-NEXT:    movl fill_table64_partial(,%eax,8), %eax
 ; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; CHECK32-NEXT:    ret{{[l|q]}}
+; CHECK32-NEXT:    retl
 entry:
   %arrayidx = getelementptr inbounds [51 x i64], [51 x i64]* @fill_table64_partial, i64 0, i64 %y
   %0 = load i64, i64* %arrayidx, align 8
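
Note for reviewers: a minimal C sketch (not part of the patch) of the source idiom this combine targets. The function name keep_low_bits is a hypothetical illustration; the table mirrors @fill_table32 from the test above.

  // Table whose j-th element is (1 << j) - 1, which is exactly what
  // combineAndLoadToBZHI verifies element by element.
  static const int fill_table32[32] = {
      0x0,        0x1,       0x3,       0x7,       0xF,       0x1F,
      0x3F,       0x7F,      0xFF,      0x1FF,     0x3FF,     0x7FF,
      0xFFF,      0x1FFF,    0x3FFF,    0x7FFF,    0xFFFF,    0x1FFFF,
      0x3FFFF,    0x7FFFF,   0xFFFFF,   0x1FFFFF,  0x3FFFFF,  0x7FFFFF,
      0xFFFFFF,   0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
      0x3FFFFFFF, 0x7FFFFFFF};

  // Hypothetical function: masks x down to its y low bits via the table.
  // Without BMI2 this lowers to the movslq/andl sequence the test removes;
  // with -mbmi2 the and-load pair becomes a single 'bzhil %esi, %edi, %eax'
  // (see f32_bzhi above).
  int keep_low_bits(int x, int y) {
    return x & fill_table32[y];
  }

A shorter table such as the 17-entry @fill_table32_partial is still accepted, since the combine only requires ArrayNumElements <= the element bit width: an index past the end of the array would be an out-of-bounds access in the source, so replacing the load with bzhi stays valid for all well-defined inputs.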