Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5627,6 +5627,34 @@
   return DAG.getZExtOrTrunc(Setcc, DL, VT);
 }
 
+/// For targets that support usubsat, match a bit-hack form of that operation
+/// that ends in 'and' and convert it.
+static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N1.getValueType();
+
+  // Canonicalize xor as operand 0.
+  if (N1.getOpcode() == ISD::XOR)
+    std::swap(N0, N1);
+
+  if (N0.getOpcode() != ISD::XOR || N1.getOpcode() != ISD::SRA ||
+      N0.getOperand(0) != N1.getOperand(0))
+    return SDValue();
+
+  unsigned BitWidth = VT.getScalarSizeInBits();
+  ConstantSDNode *XorC = isConstOrConstSplat(N0.getOperand(1), true);
+  ConstantSDNode *SraC = isConstOrConstSplat(N1.getOperand(1), true);
+  if (!XorC || !XorC->getAPIntValue().isSignMask() ||
+      !SraC || SraC->getAPIntValue() != BitWidth - 1)
+    return SDValue();
+
+  // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128
+  SDLoc DL(N);
+  SDValue SignMask = DAG.getConstant(XorC->getAPIntValue(), DL, VT);
+  return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), SignMask);
+}
+
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -5989,6 +6017,10 @@
   if (IsAndZeroExtMask(N0, N1))
     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));
 
+  if (TLI.isOperationLegal(ISD::USUBSAT, VT))
+    if (SDValue V = foldAndToUsubsat(N, DAG))
+      return V;
+
   return SDValue();
 }
 
Index: llvm/test/CodeGen/AMDGPU/usubsat.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -86,26 +86,22 @@
 ; GFX8-LABEL: usubsat_as_bithack_i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v0
-; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_movk_i32 s4, 0x8000
+; GFX8-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: usubsat_as_bithack_i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_ashrrev_i16_e32 v1, 15, v0
-; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
-; GFX9-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x8000
+; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: usubsat_as_bithack_i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_ashrrev_i16 v1, 15, v0
-; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
-; GFX10-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %signsplat = ashr i16 %x, 15
   %flipsign = xor i16 %x, 32768
Index: llvm/test/CodeGen/X86/psubus.ll
===================================================================
--- llvm/test/CodeGen/X86/psubus.ll
+++ llvm/test/CodeGen/X86/psubus.ll
@@ -31,31 +31,13 @@
 define <8 x i16> @ashr_xor_and(<8 x i16> %x) nounwind {
 ; SSE-LABEL: ashr_xor_and:
 ; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psraw $15, %xmm1
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: ashr_xor_and:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: ashr_xor_and:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: ashr_xor_and:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsraw $15, %xmm0, %xmm1
-; AVX512-NEXT: vpternlogq $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: ashr_xor_and:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
   %signsplat = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   %flipsign = xor <8 x i16> %x, <i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
   %res = and <8 x i16> %signsplat, %flipsign
@@ -68,9 +50,10 @@
 ; SSE-NEXT: pxor %xmm1, %xmm1
 ; SSE-NEXT: pcmpgtb %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm1, (%rdi)
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: movdqa %xmm0, (%rsi)
-; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, (%rsi)
+; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: ashr_xor_and_commute_uses:
@@ -78,9 +61,9 @@
 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
 ; AVX-NEXT: vmovdqa %xmm1, (%rdi)
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: retq
   %signsplat = ashr <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   store <16 x i8> %signsplat, <16 x i8>* %p1
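
Note: as a quick sanity check of the identity foldAndToUsubsat relies on, the standalone C++ sketch below (illustrative only, not part of the patch) exhaustively verifies that the bit-hack form "(i8 X ^ 128) & (i8 X s>> 7)" produces the same value as "usubsat X, 128" for all 256 byte values. It assumes the host compiler performs an arithmetic right shift on negative signed values (implementation-defined before C++20, but universal in practice).

// Standalone check of the fold's core identity; illustrative only.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V != 256; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    // Reference semantics of usubsat X, 128: subtract, clamping at 0.
    uint8_t Sat = X >= 0x80 ? static_cast<uint8_t>(X - 0x80) : 0;
    // Bit-hack form: the arithmetic shift splats the sign bit into an
    // all-zeros/all-ones mask, and the xor flips the sign bit.
    uint8_t Mask = static_cast<uint8_t>(static_cast<int8_t>(X) >> 7);
    uint8_t Hack = static_cast<uint8_t>((X ^ 0x80) & Mask);
    assert(Sat == Hack && "bit-hack form must match usubsat");
  }
  return 0;
}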