Index: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -451,6 +451,7 @@
     }
 
     bool foldLoadStoreIntoMemOperand(SDNode *Node);
+    bool matchBEXTRFromAndImm(SDNode *Node);
     bool matchBEXTR(SDNode *Node);
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
@@ -1340,6 +1341,64 @@
   return false;
 }
 
+// Transform "(X >> SHIFT) & (MASK << C1)" to
+// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
+// matched to a BEXTR later. Returns false if the simplification is performed.
+static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
+                                   uint64_t Mask,
+                                   SDValue Shift, SDValue X,
+                                   X86ISelAddressMode &AM,
+                                   const X86Subtarget &Subtarget) {
+  if (Shift.getOpcode() != ISD::SRL ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+      !Shift.hasOneUse() || !N.hasOneUse())
+    return true;
+
+  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
+  if (!Subtarget.hasTBM() &&
+      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
+    return true;
+
+  // We need to ensure that mask is a continuous run of bits.
+  if (!isShiftedMask_64(Mask)) return true;
+
+  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+
+  // The amount of shift we're trying to fit into the addressing mode is taken
+  // from the trailing zeros of the mask.
+  unsigned AMShiftAmt = countTrailingZeros(Mask);
+
+  // There is nothing we can do here unless the mask is removing some bits.
+  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+
+  MVT VT = N.getSimpleValueType();
+  SDLoc DL(N);
+  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
+  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
+  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
+  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
+  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  insertDAGNode(DAG, N, NewSRLAmt);
+  insertDAGNode(DAG, N, NewSRL);
+  insertDAGNode(DAG, N, NewMask);
+  insertDAGNode(DAG, N, NewAnd);
+  insertDAGNode(DAG, N, NewSHLAmt);
+  insertDAGNode(DAG, N, NewSHL);
+  DAG.ReplaceAllUsesWith(N, NewSHL);
+
+  AM.Scale = 1 << AMShiftAmt;
+  AM.IndexReg = NewAnd;
+  return false;
+}
+
 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                               unsigned Depth) {
   SDLoc dl(N);
@@ -1620,6 +1679,11 @@
     // a scale on the outside of the mask.
     if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
       return false;
+
+    // Try to fold the mask and shift into BEXTR and scale.
+    if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
+      return false;
+
     break;
   }
   }
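To make the rewrite above concrete: foldMaskedShiftToBEXTR relies on the bit identity that a shifted mask's trailing zeros can migrate out of the AND into a trailing SHL, which matchAddressRecursively then folds into the addressing-mode scale. A minimal standalone sketch of that identity (illustrative C++, not part of the patch; the constants mirror the pr38938 test further down):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// (X >> SHIFT) & (MASK << C1) == ((X >> (SHIFT + C1)) & MASK) << C1
// whenever SHIFT + C1 stays within the bit width. The left form is what the
// DAG starts with; the right form exposes a BEXTR-able srl+and plus an shl
// of 1-3 bits that the addressing mode absorbs as a scale of 1 << C1.
static uint64_t beforeFold(uint64_t X, unsigned Shift, uint64_t Mask,
                           unsigned C1) {
  return (X >> Shift) & (Mask << C1);
}

static uint64_t afterFold(uint64_t X, unsigned Shift, uint64_t Mask,
                          unsigned C1) {
  return ((X >> (Shift + C1)) & Mask) << C1;
}

int main() {
  // Shift = 19, Mask = 1023, C1 = 2: (X >> 19) & 0xFFC becomes
  // ((X >> 21) & 1023) << 2, i.e. a 10-bit extract scaled by 4.
  for (uint64_t X : {0x0ULL, 0x123456789abcdef0ULL, ~0ULL})
    assert(beforeFold(X, 19, 1023, 2) == afterFold(X, 19, 1023, 2));
  return 0;
}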
@@ -2646,6 +2710,92 @@
   return true;
 }
 
+// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
+bool X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
+  MVT NVT = Node->getSimpleValueType(0);
+  SDLoc dl(Node);
+
+  SDValue N0 = Node->getOperand(0);
+  SDValue N1 = Node->getOperand(1);
+
+  // If we have TBM we can use an immediate for the control. If we have BMI
+  // we should only do this if the BEXTR instruction is implemented well.
+  // Otherwise moving the control into a register makes this more costly.
+  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
+  // hoisting the move immediate would make it worthwhile with a less optimal
+  // BEXTR?
+  if (!Subtarget->hasTBM() &&
+      !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
+    return false;
+
+  // Must have a shift right.
+  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
+    return false;
+
+  // Shift can't have additional users.
+  if (!N0->hasOneUse())
+    return false;
+
+  // Only supported for 32 and 64 bits.
+  if (NVT != MVT::i32 && NVT != MVT::i64)
+    return false;
+
+  // Shift amount and RHS of and must be constant.
+  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
+  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+  if (!MaskCst || !ShiftCst)
+    return false;
+
+  // And RHS must be a mask.
+  uint64_t Mask = MaskCst->getZExtValue();
+  if (!isMask_64(Mask))
+    return false;
+
+  uint64_t Shift = ShiftCst->getZExtValue();
+  uint64_t MaskSize = countPopulation(Mask);
+
+  // Don't interfere with something that can be handled by extracting AH.
+  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
+  if (Shift == 8 && MaskSize == 8)
+    return false;
+
+  // Make sure we are only using bits that were in the original value, not
+  // shifted in.
+  if (Shift + MaskSize > NVT.getSizeInBits())
+    return false;
+
+  SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+  unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+  unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+
+  // BMI requires the immediate to be placed in a register.
+  if (!Subtarget->hasTBM()) {
+    ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+    MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
+    New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
+  }
+
+  MachineSDNode *NewNode;
+  SDValue Input = N0->getOperand(0);
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
+    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+    // Update the chain.
+    ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+    // Record the mem-refs
+    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
+  } else {
+    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+  }
+
+  ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+  CurDAG->RemoveDeadNode(Node);
+  return true;
+}
+
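For reference, the control operand assembled above, Shift | (MaskSize << 8), follows the BEXTR encoding: the start bit lives in bits 7:0 and the field length in bits 15:8. A scalar model of the 64-bit instruction's behavior (illustrative only, not the patch's code):

#include <cassert>
#include <cstdint>

// Models BEXTR r64: extract Len bits of X starting at bit Start, where
// Start = Control[7:0] and Len = Control[15:8]. An out-of-range start or a
// zero length yields zero, as in hardware; the guards also keep the C++
// shifts well-defined.
static uint64_t bextr64(uint64_t X, uint64_t Control) {
  unsigned Start = Control & 0xff;
  unsigned Len = (Control >> 8) & 0xff;
  if (Start >= 64 || Len == 0)
    return 0;
  uint64_t Shifted = X >> Start;
  return Len >= 64 ? Shifted : Shifted & ((1ULL << Len) - 1);
}

int main() {
  // Control 0xA15 (start 21, length 10) is exactly what the pr38938 test
  // below emits after the scale fold.
  uint64_t X = 0x123456789abcdef0ULL;
  assert(bextr64(X, 0xA15) == ((X >> 21) & 1023));
  return 0;
}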
 // Emit a PCMISTR(I/M) instruction.
 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                              bool MayFoldLoad, const SDLoc &dl,
@@ -2953,6 +3103,8 @@
     break;
 
   case ISD::AND:
+    if (matchBEXTRFromAndImm(Node))
+      return;
     if (matchBEXTR(Node))
       return;
     if (AndImmShrink && shrinkAndImmediate(Node))
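The deletions that follow remove the old DAG-combine implementation; its legality guards carry over verbatim into matchBEXTRFromAndImm above. Restated as a standalone predicate for clarity (the helper name and shape are mine, not LLVM's):

#include <cstdint>

// Mirrors the shared checks: the AND operand must be a low-bit mask, the
// (shift 8, size 8) case is left to AH extraction, and the extracted field
// must consist only of bits present in the original value.
static bool isBEXTRCandidate(uint64_t Mask, uint64_t Shift, unsigned Bits) {
  if (Mask == 0 || ((Mask + 1) & Mask) != 0) // isMask_64(Mask)
    return false;
  unsigned MaskSize = __builtin_popcountll(Mask); // countPopulation(Mask)
  if (Shift == 8 && MaskSize == 8) // better handled by extracting AH
    return false;
  return Shift + MaskSize <= Bits; // no shifted-in bits allowed
}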
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -35278,69 +35278,6 @@
   return SDValue();
 }
 
-static bool hasBEXTR(const X86Subtarget &Subtarget, EVT VT) {
-  // If we have TBM we can use an immediate for the control. If we have BMI
-  // we should only do this if the BEXTR instruction is implemented well.
-  // Otherwise moving the control into a register makes this more costly.
-  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
-  // hoisting the move immediate would make it worthwhile with a less optimal
-  // BEXTR?
-  if (!Subtarget.hasTBM() && !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
-    return false;
-  return (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
-}
-
-// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
-static SDValue combineAndIntoBEXTR(SDNode *Node, SelectionDAG &DAG,
-                                   const X86Subtarget &Subtarget) {
-  EVT NVT = Node->getValueType(0);
-  SDLoc dl(Node);
-
-  SDValue N0 = Node->getOperand(0);
-  SDValue N1 = Node->getOperand(1);
-
-  // Check if subtarget has BEXTR instruction for the node's type
-  if (!hasBEXTR(Subtarget, NVT))
-    return SDValue();
-
-  // Must have a shift right.
-  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
-    return SDValue();
-
-  // Shift can't have additional users.
-  if (!N0->hasOneUse())
-    return SDValue();
-
-  // Shift amount and RHS of and must be constant.
-  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
-  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
-  if (!MaskCst || !ShiftCst)
-    return SDValue();
-
-  // And RHS must be a mask.
-  uint64_t Mask = MaskCst->getZExtValue();
-  if (!isMask_64(Mask))
-    return SDValue();
-
-  uint64_t Shift = ShiftCst->getZExtValue();
-  uint64_t MaskSize = countPopulation(Mask);
-
-  // Don't interfere with something that can be handled by extracting AH.
-  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
-  if (Shift == 8 && MaskSize == 8)
-    return SDValue();
-
-  // Make sure we are only using bits that were in the original value, not
-  // shifted in.
-  if (Shift + MaskSize > NVT.getSizeInBits())
-    return SDValue();
-
-  // Create a BEXTR node.
-  SDValue C = DAG.getConstant(Shift | (MaskSize << 8), dl, NVT);
-  SDValue New = DAG.getNode(X86ISD::BEXTR, dl, NVT, N0->getOperand(0), C);
-  return New;
-}
-
 // Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
 // Turn it into series of XORs and a setnp.
 static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
@@ -35442,9 +35379,6 @@
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  if (SDValue R = combineAndIntoBEXTR(N, DAG, Subtarget))
-    return R;
-
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
Index: llvm/trunk/lib/Target/X86/X86InstrCompiler.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrCompiler.td
+++ llvm/trunk/lib/Target/X86/X86InstrCompiler.td
@@ -2135,17 +2135,3 @@
 let Predicates = [HasMOVBE] in {
   def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
 }
-
-// These patterns are selected by some custom code in X86ISelDAGToDAG.cpp that
-// custom combines and+srl into BEXTR. We use these patterns to avoid a bunch
-// of manual code for folding loads.
-let Predicates = [HasBMI, NoTBM] in {
-  def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
-            (BEXTR32rr GR32:$src1, (MOV32ri imm:$src2))>;
-  def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
-            (BEXTR32rm addr:$src1, (MOV32ri imm:$src2))>;
-  def : Pat<(X86bextr GR64:$src1, mov64imm32:$src2),
-            (BEXTR64rr GR64:$src1, (MOV32ri64 mov64imm32:$src2))>;
-  def : Pat<(X86bextr (loadi64 addr:$src1), mov64imm32:$src2),
-            (BEXTR64rm addr:$src1, (MOV32ri64 mov64imm32:$src2))>;
-} // HasBMI, NoTBM
Index: llvm/trunk/test/CodeGen/X86/extract-bits.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/extract-bits.ll
+++ llvm/trunk/test/CodeGen/X86/extract-bits.ll
@@ -5568,23 +5568,69 @@
 
 ; https://bugs.llvm.org/show_bug.cgi?id=38938
 define void @pr38938(i32* %a0, i64* %a1) {
-; X86-LABEL: pr38938:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    shrl $19, %ecx
-; X86-NEXT:    andl $4092, %ecx # imm = 0xFFC
-; X86-NEXT:    incl (%eax,%ecx)
-; X86-NEXT:    retl
+; X86-NOBMI-LABEL: pr38938:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl (%ecx), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $4092, %ecx # imm = 0xFFC
+; X86-NOBMI-NEXT:    incl (%eax,%ecx)
+; X86-NOBMI-NEXT:    retl
 ;
-; X64-LABEL: pr38938:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rsi), %rax
-; X64-NEXT:    shrq $19, %rax
-; X64-NEXT:    andl $4092, %eax # imm = 0xFFC
-; X64-NEXT:    incl (%rdi,%rax)
-; X64-NEXT:    retq
+; X86-BMI1NOTBM-LABEL: pr38938:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $2581, %edx # imm = 0xA15
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, (%ecx), %ecx
+; X86-BMI1NOTBM-NEXT:    incl (%eax,%ecx,4)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: pr38938:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1TBM-NEXT:    bextrl $2581, (%ecx), %ecx # imm = 0xA15
+; X86-BMI1TBM-NEXT:    incl (%eax,%ecx,4)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: pr38938:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl $2581, %edx # imm = 0xA15
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %edx, (%ecx), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    incl (%eax,%ecx,4)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: pr38938:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq (%rsi), %rax
+; X64-NOBMI-NEXT:    shrq $19, %rax
+; X64-NOBMI-NEXT:    andl $4092, %eax # imm = 0xFFC
+; X64-NOBMI-NEXT:    incl (%rdi,%rax)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: pr38938:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $2581, %eax # imm = 0xA15
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, (%rsi), %rax
+; X64-BMI1NOTBM-NEXT:    incl (%rdi,%rax,4)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: pr38938:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrq $2581, (%rsi), %rax # imm = 0xA15
+; X64-BMI1TBM-NEXT:    incl (%rdi,%rax,4)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: pr38938:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $2581, %eax # imm = 0xA15
+; X64-BMI1NOTBMBMI2-NEXT:    bextrq %rax, (%rsi), %rax
+; X64-BMI1NOTBMBMI2-NEXT:    incl (%rdi,%rax,4)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
   %tmp = load i64, i64* %a1, align 8
   %tmp1 = lshr i64 %tmp, 21
   %tmp2 = and i64 %tmp1, 1023
Index: llvm/trunk/test/CodeGen/X86/tbm_patterns.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/tbm_patterns.ll
+++ llvm/trunk/test/CodeGen/X86/tbm_patterns.ll
@@ -53,7 +53,8 @@
 ; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; CHECK-NEXT:    shrl $4, %edi
+; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
 ; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = lshr i32 %a, 4
@@ -113,7 +114,8 @@
 ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; CHECK-NEXT:    shrl $4, %edi
+; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
 ; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = lshr i64 %a, 4
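The two tbm_patterns diffs above are the flip side of forming BEXTR at instruction selection: in the _z2 tests only the comparison of the extracted field against zero is consumed, and with no BEXTR node present during DAG combine, the zero-compare is apparently matched to shr+test rather than reusing BEXTR's ZF output. The regressing source shape is roughly the following (C signature assumed for illustration):

#include <cstdint>

// Approximate C form of test_x86_tbm_bextri_u32_z2: the extracted bitfield
// feeds only a comparison with zero, never a real use of its value, so a
// flag-setting bextrl used to cover both the extract and the compare.
uint32_t bextri_u32_z2(uint32_t a, uint32_t b, uint32_t c) {
  return ((a >> 4) & 0xfff) != 0 ? c : b;
}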