Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38773,6 +38773,47 @@
   return DAG.getBitcast(N->getValueType(0), Shift);
 }
 
+// Look for (and (srl X, C1), C2) where C2 is a mask. Try to convert it to
+// (srl (shl X, C3), C4). This avoids a constant pool load for the AND mask.
+// FIXME: This only works for masks that aren't 8, 16, or 32 bits wide, since
+// those are canonicalized to a shuffle.
+static SDValue combineAndSrlToSrlShl(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  EVT VT = N->getValueType(0);
+  if (!VT.isSimple())
+    return SDValue();
+
+  if (N0.getOpcode() != X86ISD::VSRLI || !N0.hasOneUse())
+    return SDValue();
+
+  // The AND RHS should be a constant splat mask.
+  APInt SplatVal;
+  if (!ISD::isConstantSplatVector(N1.getNode(), SplatVal) ||
+      !SplatVal.isMask())
+    return SDValue();
+
+  // VSRLI always takes a constant shift amount.
+  const APInt &N0C1 = N0.getConstantOperandAPInt(1);
+  unsigned OpSizeInBits = VT.getSimpleVT().getScalarSizeInBits();
+  unsigned ShiftAmount = N0C1.getLimitedValue(OpSizeInBits);
+  unsigned MaskSize = SplatVal.countTrailingOnes();
+
+  // If the shift already brings zeros into the bits covered by the mask, this
+  // needs more simplification elsewhere.
+  if (OpSizeInBits - ShiftAmount <= MaskSize)
+    return SDValue();
+
+  SDLoc dl(N);
+  unsigned ShlAmt = OpSizeInBits - ShiftAmount - MaskSize;
+  SDValue Shl = DAG.getNode(X86ISD::VSHLI, dl, VT, N0.getOperand(0),
+                            DAG.getConstant(ShlAmt, dl, MVT::i8));
+  return DAG.getNode(X86ISD::VSRLI, dl, VT, Shl,
+                     DAG.getConstant(OpSizeInBits - MaskSize, dl, MVT::i8));
+}
+
 // Get the index node from the lowered DAG of a GEP IR instruction with one
 // indexing dimension.
 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
@@ -39023,6 +39064,9 @@
   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
     return ShiftRight;
 
+  if (SDValue V = combineAndSrlToSrlShl(N, DAG, Subtarget))
+    return V;
+
   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
     return R;
 
Index: llvm/test/CodeGen/X86/vector-lshr-and.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/vector-lshr-and.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=AVX512BW
+
+
+define <2 x i64> @v2i64(<2 x i64> %x) {
+; SSE-LABEL: v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllq $11, %xmm0
+; SSE-NEXT:    psrlq $58, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $11, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlq $58, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllq $11, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsrlq $58, %xmm0, %xmm0
+; AVX512BW-NEXT:    retq
+  %a = lshr <2 x i64> %x, <i64 47, i64 47>
+  %b = and <2 x i64> %a, <i64 63, i64 63>
+  ret <2 x i64> %b
+}
+
+define <4 x i64> @v4i64(<4 x i64> %x) {
+; SSE-LABEL: v4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllq $39, %xmm0
+; SSE-NEXT:    psrlq $55, %xmm0
+; SSE-NEXT:    psllq $39, %xmm1
+; SSE-NEXT:    psrlq $55, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $39, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $55, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllq $39, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpsrlq $55, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+  %a = lshr <4 x i64> %x, <i64 16, i64 16, i64 16, i64 16>
+  %b = and <4 x i64> %a, <i64 511, i64 511, i64 511, i64 511>
+  ret <4 x i64> %b
+}
+
+define <8 x i64> @v8i64(<8 x i64> %x) {
+; SSE-LABEL: v8i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllq $12, %xmm0
+; SSE-NEXT:    psrlq $47, %xmm0
+; SSE-NEXT:    psllq $12, %xmm1
+; SSE-NEXT:    psrlq $47, %xmm1
+; SSE-NEXT:    psllq $12, %xmm2
+; SSE-NEXT:    psrlq $47, %xmm2
+; SSE-NEXT:    psllq $12, %xmm3
+; SSE-NEXT:    psrlq $47, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $12, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $47, %ymm0, %ymm0
+; AVX2-NEXT:    vpsllq $12, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $47, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: v8i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllq $12, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlq $47, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+  %a = lshr <8 x i64> %x, <i64 35, i64 35, i64 35, i64 35, i64 35, i64 35, i64 35, i64 35>
+  %b = and <8 x i64> %a, <i64 131071, i64 131071, i64 131071, i64 131071, i64 131071, i64 131071, i64 131071, i64 131071>
+  ret <8 x i64> %b
+}
+
+; FIXME: We don't use a shift pair here because byte-sized ANDs become shuffles.
+define <4 x i32> @v4i32(<4 x i32> %x) {
+; SSE-LABEL: v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrld $17, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $17, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrld $17, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    retq
+  %a = lshr <4 x i32> %x, <i32 17, i32 17, i32 17, i32 17>
+  %b = and <4 x i32> %a, <i32 255, i32 255, i32 255, i32 255>
+  ret <4 x i32> %b
+}
+
+define <8 x i32> @v8i32(<8 x i32> %x) {
+; SSE-LABEL: v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $4, %xmm0
+; SSE-NEXT:    psrld $22, %xmm0
+; SSE-NEXT:    pslld $4, %xmm1
+; SSE-NEXT:    psrld $22, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $22, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: v8i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpslld $4, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpsrld $22, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+  %a = lshr <8 x i32> %x, <i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18>
+  %b = and <8 x i32> %a, <i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023>
+  ret <8 x i32> %b
+}
+
+define <16 x i32> @v16i32(<16 x i32> %x) {
+; SSE-LABEL: v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $1, %xmm0
+; SSE-NEXT:    psrld $21, %xmm0
+; SSE-NEXT:    pslld $1, %xmm1
+; SSE-NEXT:    psrld $21, %xmm1
+; SSE-NEXT:    pslld $1, %xmm2
+; SSE-NEXT:    psrld $21, %xmm2
+; SSE-NEXT:    pslld $1, %xmm3
+; SSE-NEXT:    psrld $21, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $21, %ymm0, %ymm0
+; AVX2-NEXT:    vpslld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $21, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: v16i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpslld $1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrld $21, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+  %a = lshr <16 x i32> %x, <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
+  %b = and <16 x i32> %a, <i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047, i32 2047>
+  ret <16 x i32> %b
+}
+
+define <8 x i16> @v8i16(<8 x i16> %x) {
+; SSE-LABEL: v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $5, %xmm0
+; SSE-NEXT:    psrlw $9, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $9, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsrlw $9, %xmm0, %xmm0
+; AVX512BW-NEXT:    retq
+  %a = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+  %b = and <8 x i16> %a, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  ret <8 x i16> %b
+}
+
+define <16 x i16> @v16i16(<16 x i16> %x) {
+; SSE-LABEL: v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $2, %xmm0
+; SSE-NEXT:    psrlw $4, %xmm0
+; SSE-NEXT:    psllw $2, %xmm1
+; SSE-NEXT:    psrlw $4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllw $2, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+  %a = lshr <16 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  %b = and <16 x i16> %a, <i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095, i16 4095>
+  ret <16 x i16> %b
+}
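
Note (not part of the patch itself): the combine rewrites (X >> C1) & Mask, where Mask has MaskSize trailing one bits, into (X << (BitWidth - C1 - MaskSize)) >> (BitWidth - MaskSize). The following minimal standalone C++ sketch checks that identity for the v2i64 values above (lshr by 47, 6-bit mask); the names and test inputs are illustrative only, not taken from LLVM.

#include <cassert>
#include <cstdint>

int main() {
  // Values matching the v2i64 test: lshr by 47, mask of 6 set bits (63).
  const unsigned BitWidth = 64, ShrAmt = 47, MaskSize = 6;
  const uint64_t Mask = (1ULL << MaskSize) - 1;
  // Shift amounts the combine would pick.
  const unsigned ShlAmt = BitWidth - ShrAmt - MaskSize; // 11, matches psllq $11
  const unsigned NewShrAmt = BitWidth - MaskSize;       // 58, matches psrlq $58
  for (uint64_t X : {0x0123456789ABCDEFULL, ~0ULL, 1ULL << 47, 0ULL}) {
    uint64_t AndForm = (X >> ShrAmt) & Mask;        // original and+srl form
    uint64_t ShlShrForm = (X << ShlAmt) >> NewShrAmt; // shift-pair form
    assert(AndForm == ShlShrForm);
  }
  return 0;
}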