diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34053,6 +34053,17 @@
     }
     break;
   }
+  case X86ISD::PDEP: {
+    KnownBits Known2;
+    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    // Zeros are retained from the mask operand, but not ones.
+    Known.One.clearAllBits();
+    // The result will have at least as many trailing zeros as the non-mask
+    // operand since bits can only map to the same or higher bit position.
+    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+    break;
+  }
   case X86ISD::VTRUNC:
   case X86ISD::VTRUNCS:
   case X86ISD::VTRUNCUS:
@@ -38373,6 +38384,34 @@
     break;
   }
+  case X86ISD::PDEP: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
+    APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
+
+    // If the demanded bits have leading zeros, we don't demand those from the
+    // mask.
+    if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
+      return true;
+
+    // The number of possible 1s in the mask determines the number of LSBs of
+    // operand 0 used. Undemanded bits from the mask don't matter, so filter
+    // them out before counting.
+    KnownBits Known2;
+    uint64_t Count = (~Known.Zero & LoMask).countPopulation();
+    APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
+    if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
+      return true;
+
+    // Zeros are retained from the mask, but not ones.
+    Known.One.clearAllBits();
+    // The result will have at least as many trailing zeros as the non-mask
+    // operand since bits can only map to the same or higher bit position.
+    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+    return false;
+  }
   }
 
   return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -49580,6 +49619,17 @@
   return SDValue();
 }
 
+static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI) {
+  unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+                               APInt::getAllOnesValue(NumBits), DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -49750,6 +49800,7 @@
   case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
   case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
   case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
+  case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
   }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/X86/bmi2-x86_64.ll b/llvm/test/CodeGen/X86/bmi2-x86_64.ll
--- a/llvm/test/CodeGen/X86/bmi2-x86_64.ll
+++ b/llvm/test/CodeGen/X86/bmi2-x86_64.ll
@@ -44,9 +44,9 @@
 define i64 @pdep64_anyext(i32 %x) {
 ; CHECK-LABEL: pdep64_anyext:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movslq %edi, %rax
-; CHECK-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; CHECK-NEXT:    pdepq %rcx, %rax, %rax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; CHECK-NEXT:    pdepq %rax, %rdi, %rax
 ; CHECK-NEXT:    retq
   %x1 = sext i32 %x to i64
   %tmp = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x1, i64 6148914691236517205)
diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -86,9 +86,9 @@
 ;
 ; X64-LABEL: pdep32_anyext:
 ; X64:       # %bb.0:
-; X64-NEXT:    movswl %di, %eax
-; X64-NEXT:    movl $-1431655766, %ecx # imm = 0xAAAAAAAA
-; X64-NEXT:    pdepl %ecx, %eax, %eax
+; X64-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X64-NEXT:    pdepl %eax, %edi, %eax
 ; X64-NEXT:    retq
   %x1 = sext i16 %x to i32
   %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x1, i32 -1431655766)
@@ -101,14 +100,12 @@
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    pdepl %ecx, %eax, %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32_demandedbits:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl $1431655765, %eax # imm = 0x55555555
 ; X64-NEXT:    pdepl %eax, %edi, %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X64-NEXT:    retq
   %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 1431655765)
   %tmp2 = and i32 %tmp, 1431655765
@@ -125,8 +122,7 @@
 ;
 ; X64-LABEL: pdep32_demandedbits2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    pdepl %esi, %eax, %eax
+; X64-NEXT:    pdepl %esi, %edi, %eax
 ; X64-NEXT:    andl $128, %eax
 ; X64-NEXT:    retq
   %tmp = and i32 %x, 255
@@ -146,8 +142,7 @@
 ;
 ; X64-LABEL: pdep32_demandedbits_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    movswl %si, %eax
-; X64-NEXT:    pdepl %eax, %edi, %eax
+; X64-NEXT:    pdepl %esi, %edi, %eax
 ; X64-NEXT:    andl $32768, %eax # imm = 0x8000
 ; X64-NEXT:    retq
   %tmp = sext i16 %y to i32
@@ -167,8 +162,7 @@
 ;
 ; X64-LABEL: pdep32_demandedbits_mask2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movswl %si, %eax
-; X64-NEXT:    pdepl %eax, %edi, %eax
+; X64-NEXT:    pdepl %esi, %edi, %eax
 ; X64-NEXT:    movzwl %ax, %eax
 ; X64-NEXT:    retq
   %tmp = sext i16 %y to i32
@@ -182,19 +176,15 @@
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    pdepl %ecx, %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    pdepl %ecx, %eax, %eax
+; X86-NEXT:    imull %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32_knownbits:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    pdepl %eax, %edi, %ecx
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    imull %ecx, %eax
+; X64-NEXT:    pdepl %eax, %edi, %eax
+; X64-NEXT:    imull %eax, %eax
 ; X64-NEXT:    retq
   %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 1431655765)
   %tmp2 = and i32 %tmp, 1431655765
@@ -207,19 +197,15 @@
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl $-256, %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $-256, %eax
-; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
+; X86-NEXT:    imull %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32_knownbits2:
 ; X64:       # %bb.0:
 ; X64-NEXT:    andl $-256, %edi
-; X64-NEXT:    pdepl %esi, %edi, %ecx
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    andl $-256, %eax
-; X64-NEXT:    imull %ecx, %eax
+; X64-NEXT:    pdepl %esi, %edi, %eax
+; X64-NEXT:    imull %eax, %eax
 ; X64-NEXT:    retq
   %tmp = and i32 %x, -256
   %tmp2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %tmp, i32 %y)
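
Some context on why the two KnownBits claims in the patch hold. PDEP deposits the low bits of its first operand, LSB first, at the positions of the set bits of its mask operand. So every result bit lies under a set mask bit, which is why the mask's known zeros survive (known ones do not: a set mask bit produces a one only if the corresponding source bit is one), and each source bit can only move to the same or a higher position, which is why the source's trailing zeros survive. A minimal standalone sketch of both facts; `pdep32` is a hypothetical scalar model of the instruction written for this note, not LLVM code:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical scalar model of PDEP: deposit the low bits of Src, LSB
// first, into the positions of the set bits of Mask.
static uint32_t pdep32(uint32_t Src, uint32_t Mask) {
  uint32_t Result = 0;
  for (uint32_t SrcBit = 1; Mask != 0; Mask &= Mask - 1, SrcBit <<= 1)
    if (Src & SrcBit)
      Result |= Mask & -Mask; // lowest set bit remaining in the mask
  return Result;
}

int main() {
  // Zeros of the mask are zeros of the result (the Known.Zero claim in
  // computeKnownBits): no result bit falls outside the mask.
  assert((pdep32(0xDEADBEEF, 0x55555555) & ~0x55555555u) == 0);
  // Trailing zeros of the non-mask operand survive (the setLowBits
  // claim): the source has 4 trailing zeros, the result 0xF00 has 8.
  assert(pdep32(0xF0, 0x0F0F0F0F) == 0xF00);
  return 0;
}
```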
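The movslq/movswl removals in the anyext tests fall out of the demanded-LSBs reasoning: with a 16-set-bit mask such as 0xAAAAAAAA, Count is at most 16, so bits 16..31 of the source are never demanded and the sign extension is dead. A sketch of that equivalence under the same hypothetical pdep32 model (repeated so the block stands alone):

```cpp
#include <cassert>
#include <cstdint>

// Same hypothetical scalar model of PDEP as in the previous sketch.
static uint32_t pdep32(uint32_t Src, uint32_t Mask) {
  uint32_t Result = 0;
  for (uint32_t SrcBit = 1; Mask != 0; Mask &= Mask - 1, SrcBit <<= 1)
    if (Src & SrcBit)
      Result |= Mask & -Mask;
  return Result;
}

int main() {
  // 0xAAAAAAAA has 16 set bits, so at most source bits 0..15 are ever
  // consumed; anything above bit 15 is undemanded.
  uint32_t Mask = 0xAAAAAAAAu;
  int16_t X = -12345;
  uint32_t SignExt = (uint32_t)(int32_t)X; // what the old movswl produced
  uint32_t ZeroExt = (uint16_t)X;          // any extension agrees on bits 0..15
  assert(pdep32(SignExt, Mask) == pdep32(ZeroExt, Mask));
  return 0;
}
```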
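pdep32_demandedbits2 exercises the other half of the new SimplifyDemandedBits case: the user only demands bit 7 of the result (the andl $128), so LoMask restricts demand to mask bits 0..7, Count is at most 8, and the movzbl that cleared source bits 8..31 is redundant. The same model again, checking that equality for a few masks:

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Same hypothetical scalar model of PDEP as in the sketches above.
static uint32_t pdep32(uint32_t Src, uint32_t Mask) {
  uint32_t Result = 0;
  for (uint32_t SrcBit = 1; Mask != 0; Mask &= Mask - 1, SrcBit <<= 1)
    if (Src & SrcBit)
      Result |= Mask & -Mask;
  return Result;
}

int main() {
  // Result bit 7 depends only on mask bits 0..7 and, through the
  // popcount bound, on at most source bits 0..7, so masking the source
  // with 255 can never change the demanded bit.
  uint32_t X = 0x12345678u;
  for (uint32_t Mask : {0xFFu, 0x80u, 0xAB12u, 0xFFFFFFFFu})
    assert((pdep32(X, Mask) & 128u) == (pdep32(X & 255u, Mask) & 128u));
  return 0;
}
```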