diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -824,6 +824,14 @@ // Return true if the target wants to transform Op(Splat(X)) -> Splat(Op(X)) virtual bool preferScalarizeSplat(SDNode *N) const { return true; } + // Return true if the target wants to transform: + // (TruncVT truncate(sext_in_reg(VT X, ExtVT))) + // -> (TruncVT sext_in_reg(truncate(VT X), ExtVT)) + // Some targets might prefer pre-sextinreg to improve truncation/saturation. + virtual bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const { + return true; + } + /// Return true if the target wants to use the optimization that /// turns ext(promotableInst1(...(promotableInstN(load)))) into /// promotedInst1(...(promotedInstN(ext(load)))). diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10136,6 +10136,7 @@ // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports // sext_inreg. + // TODO: Remove this and use SimplifyDemandedBits instead. 
ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue(); @@ -14374,7 +14375,7 @@ SDValue X = N0.getOperand(0); SDValue ExtVal = N0.getOperand(1); EVT ExtVT = cast<VTSDNode>(ExtVal)->getVT(); - if (ExtVT.bitsLT(VT)) { + if (ExtVT.bitsLT(VT) && TLI.preferSextInRegOfTruncate(VT, SrcVT, ExtVT)) { SDValue TrX = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X); return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, TrX, ExtVal); } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1945,6 +1945,35 @@ if (ShAmt == 0) return TLO.CombineTo(Op, Op0); + // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target + // supports sext_inreg. + if (Op0.getOpcode() == ISD::SHL) { + if (const APInt *InnerSA = + TLO.DAG.getValidShiftAmountConstant(Op0, DemandedElts)) { + unsigned LowBits = BitWidth - ShAmt; + EVT ExtVT = EVT::getIntegerVT(*TLO.DAG.getContext(), LowBits); + if (VT.isVector()) + ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtVT, + VT.getVectorElementCount()); + + if (*InnerSA == ShAmt) { + if (!TLO.LegalOperations() || + isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT)) + return TLO.CombineTo( + Op, TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, + Op0.getOperand(0), + TLO.DAG.getValueType(ExtVT))); + + // Even if we can't convert to sext_inreg, we might be able to + // remove this shift pair if the input is already sign extended. 
+ unsigned NumSignBits = + TLO.DAG.ComputeNumSignBits(Op0.getOperand(0), DemandedElts); + if (NumSignBits > ShAmt) + return TLO.CombineTo(Op, Op0.getOperand(0)); + } + } + } + APInt InDemandedMask = (DemandedBits << ShAmt); // If the shift is exact, then it does demand the low bits (and knows that diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1054,6 +1054,9 @@ bool preferABDSToABSWithNSW(EVT VT) const override; + bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, + EVT ExtVT) const override; + /// Return true if the target has native support for /// the specified value type and it is 'desirable' to use the type for the /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -55982,6 +55982,12 @@ return false; } +// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS. 
+bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT, + EVT ExtVT) const { + return Subtarget.hasAVX512() || !VT.isVector(); +} + bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (!isTypeLegal(VT)) return false; diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll --- a/llvm/test/CodeGen/X86/packss.ll +++ b/llvm/test/CodeGen/X86/packss.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X86-SSE ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X86-SSE -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX,X86-AVX1 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX,X86-AVX2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X64-SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX,X64-AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX,X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind { ; SSE2-LABEL: trunc_ashr_v4i64: @@ -175,107 +175,55 @@ } define <8 x i16> 
@trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) { -; X86-SSE-LABEL: trunc_ashr_v4i64_demandedelts: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: psllq $63, %xmm1 -; X86-SSE-NEXT: psllq $63, %xmm0 -; X86-SSE-NEXT: psrlq $63, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,0,2147483648] -; X86-SSE-NEXT: pxor %xmm2, %xmm0 -; X86-SSE-NEXT: psubq %xmm2, %xmm0 -; X86-SSE-NEXT: psrlq $63, %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm1 -; X86-SSE-NEXT: psubq %xmm2, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X86-SSE-NEXT: packssdw %xmm1, %xmm0 -; X86-SSE-NEXT: retl -; -; X86-AVX1-LABEL: trunc_ashr_v4i64_demandedelts: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm1 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,0,1,0] -; X86-AVX1-NEXT: # xmm2 = mem[0,0] -; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1 -; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vzeroupper -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: trunc_ashr_v4i64_demandedelts: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0] -; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1] -; X86-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,0,0,2147483648,1,0,0,2147483648] -; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1] -; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; 
X86-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vzeroupper -; X86-AVX2-NEXT: retl +; SSE2-LABEL: trunc_ashr_v4i64_demandedelts: +; SSE2: # %bb.0: +; SSE2-NEXT: psllq $63, %xmm0 +; SSE2-NEXT: psllq $63, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: ret{{[l|q]}} ; -; X64-SSE-LABEL: trunc_ashr_v4i64_demandedelts: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: psllq $63, %xmm1 -; X64-SSE-NEXT: psllq $63, %xmm0 -; X64-SSE-NEXT: psrlq $63, %xmm0 -; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,9223372036854775808] -; X64-SSE-NEXT: pxor %xmm2, %xmm0 -; X64-SSE-NEXT: psubq %xmm2, %xmm0 -; X64-SSE-NEXT: psrlq $63, %xmm1 -; X64-SSE-NEXT: pxor %xmm2, %xmm1 -; X64-SSE-NEXT: psubq %xmm2, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-SSE-NEXT: packssdw %xmm1, %xmm0 -; X64-SSE-NEXT: retq +; SSE4-LABEL: trunc_ashr_v4i64_demandedelts: +; SSE4: # %bb.0: +; SSE4-NEXT: psllq $63, %xmm0 +; SSE4-NEXT: pxor %xmm2, %xmm2 +; SSE4-NEXT: pxor %xmm3, %xmm3 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm3 +; SSE4-NEXT: psllq $63, %xmm1 +; SSE4-NEXT: pcmpgtq %xmm1, %xmm2 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE4-NEXT: packssdw %xmm1, %xmm0 +; SSE4-NEXT: ret{{[l|q]}} ; -; X64-AVX1-LABEL: trunc_ashr_v4i64_demandedelts: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsllq $63, %xmm0, %xmm1 -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,9223372036854775808] -; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: 
vpsrlq $63, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vzeroupper -; X64-AVX1-NEXT: retq +; AVX1-LABEL: trunc_ashr_v4i64_demandedelts: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: ret{{[l|q]}} ; -; X64-AVX2-LABEL: trunc_ashr_v4i64_demandedelts: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9223372036854775808,1,9223372036854775808] -; X64-AVX2-NEXT: # ymm1 = mem[0,1,0,1] -; X64-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; AVX2-LABEL: trunc_ashr_v4i64_demandedelts: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: ret{{[l|q]}} %1 = shl <4 x i64> %a0, %2 = ashr <4 x i64> %1, %3 = bitcast 
<4 x i64> %2 to <8 x i32>