diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5633,12 +5633,15 @@
   SDValue N1 = N->getOperand(1);
   EVT VT = N1.getValueType();
 
-  // Canonicalize xor as operand 0.
-  if (N1.getOpcode() == ISD::XOR)
+  // Canonicalize SRA as operand 1.
+  if (N0.getOpcode() == ISD::SRA)
     std::swap(N0, N1);
 
-  if (N0.getOpcode() != ISD::XOR || N1.getOpcode() != ISD::SRA ||
-      !N0.hasOneUse() || !N1.hasOneUse() ||
+  // xor/add with SMIN (signmask) are logically equivalent.
+  if (N0.getOpcode() != ISD::XOR && N0.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  if (N1.getOpcode() != ISD::SRA || !N0.hasOneUse() || !N1.hasOneUse() ||
       N0.getOperand(0) != N1.getOperand(0))
     return SDValue();
 
@@ -5650,6 +5653,7 @@
     return SDValue();
 
   // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128
+  // (i8 X + 128) & (i8 X s>> 7) --> usubsat X, 128
   SDLoc DL(N);
   SDValue SignMask = DAG.getConstant(XorC->getAPIntValue(), DL, VT);
   return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), SignMask);
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -122,26 +122,22 @@
 ; GFX8-LABEL: usubsat_as_bithack2_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v0
-; GFX8-NEXT:    v_add_u16_e32 v0, 0x8000, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: usubsat_as_bithack2_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 15, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 0x8000, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: usubsat_as_bithack2_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_ashrrev_i16 v1, 15, v0
-; GFX10-NEXT:    v_add_nc_u16 v0, 0x8000, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %signsplat = ashr i16 %x, 15
   %flipsign = add i16 %x, 32768
@@ -162,26 +158,22 @@
 ; GFX8-LABEL: usubsat_as_bithack_commute_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v0
-; GFX8-NEXT:    v_add_u16_e32 v0, 0x8000, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: usubsat_as_bithack_commute_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 15, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 0x8000, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: usubsat_as_bithack_commute_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_ashrrev_i16 v1, 15, v0
-; GFX10-NEXT:    v_add_nc_u16 v0, 0x8000, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %signsplat = ashr i16 %x, 15
   %flipsign = add i16 %x, 32768
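
Side note on the DAGCombiner change above: for the constant SMIN pattern, adding the sign mask can only flip the sign bit (any carry falls off the top of the value), which is exactly what xoring it does, so the add and xor forms are interchangeable. The standalone sketch below (not part of the patch; usubsat8 is a hypothetical scalar model of ISD::USUBSAT) checks the i8 form of the fold exhaustively:

  #include <cassert>
  #include <cstdint>

  // Hypothetical scalar model of ISD::USUBSAT: subtract, clamping at zero.
  static uint8_t usubsat8(uint8_t X, uint8_t C) {
    return X < C ? 0 : static_cast<uint8_t>(X - C);
  }

  int main() {
    const unsigned SignMask = 0x80; // i8 SMIN as an unsigned bit pattern
    for (unsigned X = 0; X != 256; ++X) {
      // Models (X s>> 7): all-ones when the sign bit is set, else zero.
      unsigned Splat = (X & 0x80) ? 0xff : 0x00;
      uint8_t Expected = usubsat8(static_cast<uint8_t>(X), SignMask);
      // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128
      assert(static_cast<uint8_t>((X ^ SignMask) & Splat) == Expected);
      // (i8 X + 128) & (i8 X s>> 7) --> usubsat X, 128
      assert(static_cast<uint8_t>((X + SignMask) & Splat) == Expected);
    }
    return 0;
  }
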
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -50,17 +50,12 @@
 define <8 x i16> @ashr_add_and(<8 x i16> %x) nounwind {
 ; SSE-LABEL: ashr_add_and:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psraw $15, %xmm1
-; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: ashr_add_and:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsraw $15, %xmm0, %xmm1
-; AVX-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %signsplat = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   %flipsign = add <8 x i16> %x, <i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
@@ -140,34 +135,39 @@
 }
 
 define <4 x i32> @ashr_add_and_custom(<4 x i32> %x) nounwind {
-; SSE-LABEL: ashr_add_and_custom:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
-; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2OR3-LABEL: ashr_add_and_custom:
+; SSE2OR3:       # %bb.0:
+; SSE2OR3-NEXT:    movdqa %xmm0, %xmm1
+; SSE2OR3-NEXT:    psrad $31, %xmm1
+; SSE2OR3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2OR3-NEXT:    pand %xmm1, %xmm0
+; SSE2OR3-NEXT:    retq
+;
+; SSE41-LABEL: ashr_add_and_custom:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ashr_add_and_custom:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: ashr_add_and_custom:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: ashr_add_and_custom:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %signsplat = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
   %flipsign = add <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
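
A note on the SSE41/AVX lowering just above: x86 has psubusb/psubusw but no psubusd, so the v4i32 USUBSAT is expanded through unsigned max via the identity usubsat(X, C) == umax(X, C) - C; clamping X up to C first means the subtraction can never wrap below zero. A minimal scalar sketch of that identity (usubsat32_via_umax is an illustrative name, not an LLVM helper):

  #include <algorithm>
  #include <cstdint>

  // Mirrors the pmaxud + psubd pair: raise X to at least C, then subtract C.
  static uint32_t usubsat32_via_umax(uint32_t X, uint32_t C) {
    return std::max(X, C) - C; // pmaxud ; psubd
  }
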
@@ -428,40 +428,28 @@
 define <16 x i16> @ashr_add_and_v16i16(<16 x i16> %x) nounwind {
 ; SSE-LABEL: ashr_add_and_v16i16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    psraw $15, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psraw $15, %xmm3
-; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    pand %xmm3, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE-NEXT:    psubusw %xmm2, %xmm0
+; SSE-NEXT:    psubusw %xmm2, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: ashr_add_and_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsraw $15, %xmm2, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT:    vpsubusw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: ashr_add_and_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm1
-; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: ashr_add_and_v16i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsraw $15, %ymm0, %ymm1
-; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %signsplat = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   %flipsign = add <16 x i16> %x, <i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
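
One last observation, on the AVX512 output for ashr_add_and_custom further up: it keeps the sign-splat bithack but fuses the xor and the and into a single vpternlogd. The immediate operand is an 8-entry truth table indexed by one bit from each of the three sources, and 72 (0x48) selects B & (A ^ C), i.e. signsplat & (X ^ signmask). A bit-parallel emulation of that lookup, written here purely for illustration (ternlog32 is not an LLVM or intrinsic name):

  #include <cstdint>

  // Emulates vpternlogd: result bit i is Imm[(a << 2) | (b << 1) | c], where
  // a/b/c are bit i of A (the destination register), B, and C respectively.
  static uint32_t ternlog32(uint32_t A, uint32_t B, uint32_t C, uint8_t Imm) {
    uint32_t R = 0;
    for (unsigned I = 0; I != 32; ++I) {
      unsigned Idx =
          (((A >> I) & 1) << 2) | (((B >> I) & 1) << 1) | ((C >> I) & 1);
      R |= ((Imm >> Idx) & 1u) << I;
    }
    return R;
  }

  // ternlog32(X, SignSplat, SignMask, 0x48) == (SignSplat & (X ^ SignMask)),
  // matching vpternlogd $72 above with X in xmm0, the splat in xmm1, and the
  // broadcast sign mask as the memory operand.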