diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5627,6 +5627,35 @@ return DAG.getZExtOrTrunc(Setcc, DL, VT); } +/// For targets that support usubsat, match a bit-hack form of that operation +/// that ends in 'and' and convert it. +static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N1.getValueType(); + + // Canonicalize xor as operand 0. + if (N1.getOpcode() == ISD::XOR) + std::swap(N0, N1); + + if (N0.getOpcode() != ISD::XOR || N1.getOpcode() != ISD::SRA || + !N0.hasOneUse() || !N1.hasOneUse() || + N0.getOperand(0) != N1.getOperand(0)) + return SDValue(); + + unsigned BitWidth = VT.getScalarSizeInBits(); + ConstantSDNode *XorC = isConstOrConstSplat(N0.getOperand(1), true); + ConstantSDNode *SraC = isConstOrConstSplat(N1.getOperand(1), true); + if (!XorC || !XorC->getAPIntValue().isSignMask() || + !SraC || SraC->getAPIntValue() != BitWidth - 1) + return SDValue(); + + // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128 + SDLoc DL(N); + SDValue SignMask = DAG.getConstant(XorC->getAPIntValue(), DL, VT); + return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), SignMask); +} + SDValue DAGCombiner::visitAND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -5989,6 +6018,10 @@ if (IsAndZeroExtMask(N0, N1)) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0)); + if (hasOperation(ISD::USUBSAT, VT)) + if (SDValue V = foldAndToUsubsat(N, DAG)) + return V; + return SDValue(); } diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -86,26 +86,22 @@ ; GFX8-LABEL: usubsat_as_bithack_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: usubsat_as_bithack_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ashrrev_i16_e32 v1, 15, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: usubsat_as_bithack_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i16 v1, 15, v0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX10-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %signsplat = ashr i16 %x, 15 %flipsign = xor i16 %x, 32768 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -34,37 +34,21 @@ define <8 x i16> @ashr_xor_and(<8 x i16> %x) nounwind { ; SSE-LABEL: ashr_xor_and: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psraw $15, %xmm1 -; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: ashr_xor_and: -; AVX1: # 
%bb.0:
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: ashr_xor_and:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: ashr_xor_and:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsraw $15, %xmm0, %xmm1
-; AVX512-NEXT: vpternlogq $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: ashr_xor_and:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
  %signsplat = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %flipsign = xor <8 x i16> %x, <i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
  %res = and <8 x i16> %signsplat, %flipsign
  ret <8 x i16> %res
 }
 
+; negative test - extra uses may lead to extra instructions when custom-lowered
+
 define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, <16 x i8>* %p1, <16 x i8>* %p2) nounwind {
 ; SSE-LABEL: ashr_xor_and_commute_uses:
 ; SSE: # %bb.0:
@@ -94,27 +78,33 @@
 }
 
 define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind {
-; SSE-LABEL: ashr_xor_and_custom:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2OR3-LABEL: ashr_xor_and_custom:
+; SSE2OR3: # %bb.0:
+; SSE2OR3-NEXT: movdqa %xmm0, %xmm1
+; SSE2OR3-NEXT: psrad $31, %xmm1
+; SSE2OR3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2OR3-NEXT: pand %xmm1, %xmm0
+; SSE2OR3-NEXT: retq
+;
+; SSE41-LABEL: ashr_xor_and_custom:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: pmaxud %xmm1, %xmm0
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: ashr_xor_and_custom:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: ashr_xor_and_custom:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: ashr_xor_and_custom:
@@ -349,37 +339,28 @@
 define <16 x i16> @ashr_xor_and_v16i16(<16 x i16> %x) nounwind {
 ; SSE-LABEL: ashr_xor_and_v16i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psraw $15, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psraw $15, %xmm3
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE-NEXT: psubusw %xmm2, %xmm0
+; SSE-NEXT: psubusw %xmm2, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: ashr_xor_and_v16i16:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, 
%ymm1, %ymm1
-; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: ashr_xor_and_v16i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: ashr_xor_and_v16i16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpsraw $15, %ymm0, %ymm1
-; AVX512-NEXT: vpternlogq $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
+; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: retq
  %signsplat = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %flipsign = xor <16 x i16> %x, <i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
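
Note (editorial aside, not part of the patch): the combine added in foldAndToUsubsat relies on the identity (X ^ SignMask) & (X s>> BW-1) == usubsat(X, SignMask), as stated in the comment above the fold. Below is a minimal standalone C++ sketch that checks this exhaustively for 8-bit values; the names (signsplat, bithack, expected) are illustrative only and mirror the IR in the tests above, and the explicit select for signsplat spells out the arithmetic-shift splat instead of relying on signed-shift behavior in C++.

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned v = 0; v < 256; ++v) {
      uint8_t x = static_cast<uint8_t>(v);
      // Splat of the sign bit, i.e. the result of an arithmetic shift right by 7.
      uint8_t signsplat = (x & 0x80u) ? 0xFFu : 0x00u;
      // Bit-hack form matched by the combine: (X ^ SignMask) & (X s>> BW-1).
      uint8_t bithack = static_cast<uint8_t>((x ^ 0x80u) & signsplat);
      // Reference semantics of usubsat(X, SignMask): subtract, clamping at zero.
      uint8_t expected = (x >= 0x80u) ? static_cast<uint8_t>(x - 0x80u) : 0;
      assert(bithack == expected);
    }
    return 0;
  }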