diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8887,10 +8887,7 @@ } } - // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 - // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 >= C2 - if ((N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) && - N0->getFlags().hasExact()) { + if (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) { auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { const APInt &LHSC = LHS->getAPIntValue(); @@ -8898,54 +8895,55 @@ return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) && LHSC.getZExtValue() <= RHSC.getZExtValue(); }; - if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, - /*AllowUndefs*/ false, - /*AllowTypeMismatch*/ true)) { - SDLoc DL(N); - SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); - SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); - return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); - } - if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, - /*AllowUndefs*/ false, - /*AllowTypeMismatch*/ true)) { - SDLoc DL(N); - SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); - SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1); - return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), Diff); - } - } - ConstantSDNode *N1C = isConstOrConstSplat(N1); + SDLoc DL(N); - // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or - // (and (srl x, (sub c1, c2), MASK) - // Only fold this if the inner shift has no other uses -- if it does, folding - // this will increase the total number of instructions. - // TODO - drop hasOneUse requirement if c1 == c2? - // TODO - support non-uniform vector shift amounts. 
- if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() && - TLI.shouldFoldConstantShiftPairToMask(N, Level)) { - if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { - if (N0C1->getAPIntValue().ult(OpSizeInBits)) { - uint64_t c1 = N0C1->getZExtValue(); - uint64_t c2 = N1C->getZExtValue(); - APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1); - SDValue Shift; - if (c2 > c1) { - Mask <<= c2 - c1; - SDLoc DL(N); - Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), - DAG.getConstant(c2 - c1, DL, ShiftVT)); - } else { - Mask.lshrInPlace(c1 - c2); - SDLoc DL(N); - Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), - DAG.getConstant(c1 - c2, DL, ShiftVT)); - } - SDLoc DL(N0); - return DAG.getNode(ISD::AND, DL, VT, Shift, - DAG.getConstant(Mask, DL, VT)); + // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 + // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 >= C2 + if (N0->getFlags().hasExact()) { + if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); + return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); + } + if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1); + return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), Diff); + } + } + + // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or + // (and (srl x, (sub c1, c2), MASK) + // Only fold this if the inner shift has no other uses -- if it does, + // folding this will increase the total number of instructions. + // TODO - drop hasOneUse requirement if c1 == c2? 
+ if (N0.getOpcode() == ISD::SRL && N0.hasOneUse() && + TLI.shouldFoldConstantShiftPairToMask(N, Level)) { + if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, N01); + Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, Diff); + SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, SDLoc(N0), VT, Shift, Mask); + } + if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, N1); + SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, SDLoc(N0), VT, Shift, Mask); } } } @@ -8984,6 +8982,7 @@ return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl); } + ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && !N1C->isOpaque()) if (SDValue NewSHL = visitShiftByConstant(N)) return NewSHL; diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -368,9 +368,8 @@ ; SSE2-LABEL: combine_vec_shl_zext_lshr1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] @@ -379,22 +378,20 @@ ; SSE41-LABEL: combine_vec_shl_zext_lshr1: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_vec_shl_zext_lshr1: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: retq - %1 = lshr <8 x i16> %x, + %1 = lshr <8 x i16> %x, %2 = zext <8 x i16> %1 to <8 x i32> - %3 = shl <8 x i32> %2, + %3 = shl <8 x i32> %2, ret <8 x i32> %3 } @@ -509,8 +506,8 @@ ; ; AVX-LABEL: combine_vec_shl_gt_lshr0: ; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] ; AVX-NEXT: vpand %xmm1, %xmm0, 
%xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> %x, @@ -519,46 +516,19 @@ } define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) { -; SSE2-LABEL: combine_vec_shl_gt_lshr1: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $3, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $5, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $8, %xmm1 -; SSE2-NEXT: psrld $4, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: combine_vec_shl_gt_lshr1: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $8, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrld $4, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $5, %xmm1 -; SSE41-NEXT: psrld $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: combine_vec_shl_gt_lshr1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $2, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_shl_gt_lshr1: ; AVX: # %bb.0: -; AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpslld $2, %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq - %1 = lshr <4 x i32> %x, - %2 = shl <4 x i32> %1, + %1 = lshr <4 x i32> %x, + %2 = shl <4 x i32> %1, ret <4 x i32> %2 } @@ -572,8 +542,8 @@ ; ; AVX-LABEL: combine_vec_shl_le_lshr0: ; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816] ; AVX-NEXT: vpsrld $2, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> %x, @@ -585,40 +555,25 @@ ; SSE2-LABEL: combine_vec_shl_le_lshr1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $5, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $7, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $8, %xmm1 -; SSE2-NEXT: psrld $6, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psrld $2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_le_lshr1: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $8, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrld $6, %xmm2 -; SSE41-NEXT: pblendw 
{{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $7, %xmm1 -; SSE41-NEXT: psrld $5, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: psrld $2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_vec_shl_le_lshr1: ; AVX: # %bb.0: ; AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -172,9 +172,9 @@ define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind { ; CHECK-LABEL: no_extract_shrl: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4026531840,4026531840,4026531840,4026531840] -; CHECK-NEXT: vpslld $25, %xmm0, %xmm2 -; CHECK-NEXT: vpand %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vpslld $25, %xmm0, %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4026531840,4026531840,4026531840,4026531840] +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpsrld $9, %xmm0, %xmm0 ; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: ret{{[l|q]}}
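
Note on the transform: the old code only handled the shl(srl(x,c1),c2)-to-mask fold when both shift amounts were uniform constants (isConstOrConstSplat), and it built MASK as an APInt. The patch instead matches the amounts per element with ISD::matchBinaryPredicate and materialises MASK as generic SHL/SRL nodes applied to getAllOnesConstant, so non-uniform vector shift amounts now fold as well; that is why the pmulhuw/pmullw and psrlvd/psllvd sequences in combine-shl.ll collapse to a single immediate shift plus pand, and why the ConstantSDNode *N1C splat check can move down to where visitShiftByConstant still needs it.

Below is a standalone scalar sanity check of the identity the new mask construction relies on. It is a minimal sketch written for this note, not LLVM code: the helper names reference/folded are illustrative, and uint32_t stands in for one vector element (the DAG version does the same arithmetic per lane with SUB/SHL/SRL nodes).

// Checks, for every pair of in-range shift amounts:
//   (x >> c1) << c2  ==  ((x >> (c1-c2)) & ((~0u << c1) >> (c1-c2)))  when c1 >= c2
//   (x >> c1) << c2  ==  ((x << (c2-c1)) & (~0u << c2))               when c2 >= c1
// which mirrors Mask = SRL(SHL(AllOnes, N01), Diff) and Mask = SHL(AllOnes, N1)
// in the patched visitShl.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t reference(uint32_t X, unsigned C1, unsigned C2) {
  // Original pattern: shl (srl X, C1), C2.
  return (X >> C1) << C2;
}

static uint32_t folded(uint32_t X, unsigned C1, unsigned C2) {
  uint32_t AllOnes = ~UINT32_C(0);
  if (C1 >= C2) {
    // Single right shift by the difference, then mask off the low C2 bits
    // and the bits that the original srl would have cleared at the top.
    uint32_t Mask = (AllOnes << C1) >> (C1 - C2);
    return (X >> (C1 - C2)) & Mask;
  }
  // Single left shift by the difference, then clear the low C2 bits.
  uint32_t Mask = AllOnes << C2;
  return (X << (C2 - C1)) & Mask;
}

int main() {
  const uint32_t Inputs[] = {0u, 1u, 0xdeadbeefu, 0xffffffffu, 0x80000001u};
  for (uint32_t X : Inputs)
    for (unsigned C1 = 0; C1 < 32; ++C1)
      for (unsigned C2 = 0; C2 < 32; ++C2)
        assert(reference(X, C1, C2) == folded(X, C1, C2));
  std::puts("shl(srl(x,c1),c2) mask-fold identity holds for all tested amounts");
  return 0;
}

Because each lane only needs its own (c1, c2) pair for the SUB and mask shifts, nothing in this identity requires the amounts to be splats, which is the whole point of switching to matchBinaryPredicate here.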