Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -26409,8 +26409,7 @@
 
   // look for psign/blend
   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
-    if (!Subtarget.hasSSSE3() ||
-        (VT == MVT::v4i64 && !Subtarget.hasInt256()))
+    if (VT == MVT::v4i64 && !Subtarget.hasInt256())
       return SDValue();
 
     // Canonicalize pandn to RHS
@@ -26446,11 +26445,14 @@
     // there is no psrai.b
     unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
     unsigned SraAmt = ~0;
+    unsigned SRLOpc;
     if (Mask.getOpcode() == ISD::SRA) {
+      SRLOpc = ISD::SRL;
       if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
         if (auto *AmtConst = AmtBV->getConstantSplatNode())
           SraAmt = AmtConst->getZExtValue();
     } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+      SRLOpc = X86ISD::VSRLI;
       SDValue SraC = Mask.getOperand(1);
       SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
     }
@@ -26459,15 +26461,30 @@
 
     SDLoc DL(N);
 
-    // Now we know we at least have a plendvb with the mask val.  See if
-    // we can form a psignb/w/d.
-    // psign = x.type == y.type == mask.type && y = sub(0, x);
+    // Try to match:
+    //   (or (and (M, (sub 0, X)), (pandn M, X)))
+    // which is a special case of vselect:
+    //   (vselect M, (sub 0, X), X)
     if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
         ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
         X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
       assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
              "Unsupported VT for PSIGN");
-      Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
+      // Per:
+      //   http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+      // We know that, if fNegate is 0 or 1:
+      //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+      //
+      // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+      //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+      //   (  M     ? -X : X) == ((X ^  M      ) + (M & 1))
+      // This lets us transform our vselect to:
+      //   (add (xor X, M), (and M, 1))
+      // And further to:
+      //   (add (xor X, M), (srl M, EltBits-1))
+      Mask = DAG.getNode(
+          ISD::ADD, DL, MaskVT, DAG.getNode(ISD::XOR, DL, MaskVT, X, Mask),
+          DAG.getNode(SRLOpc, DL, MaskVT, Mask, Mask.getOperand(1)));
       return DAG.getBitcast(VT, Mask);
     }
     // PBLENDVB only available on SSE 4.1
Index: test/CodeGen/X86/avx2-logic.ll
===================================================================
--- test/CodeGen/X86/avx2-logic.ll
+++ test/CodeGen/X86/avx2-logic.ll
@@ -72,7 +72,10 @@
 define <8 x i32> @signd(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; CHECK-LABEL: signd:
 ; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpsrad $31, %ymm1, %ymm1
+; CHECK-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpsrld $31, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
 entry:
   %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
Index: test/CodeGen/X86/vec-sign.ll
===================================================================
--- test/CodeGen/X86/vec-sign.ll
+++ test/CodeGen/X86/vec-sign.ll
@@ -3,21 +3,13 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
 
 define <4 x i32> @signd(<4 x i32> %a, <4 x i32> %b) nounwind {
-; SSE2-LABEL: signd:
-; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: signd:
-; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    psignd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; ALL-LABEL: signd:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    psrad $31, %xmm1
+; ALL-NEXT:    pxor %xmm1, %xmm0
+; ALL-NEXT:    psrld $31, %xmm1
+; ALL-NEXT:    paddd %xmm1, %xmm0
+; ALL-NEXT:    retq
 entry:
   %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
   %sub = sub nsw <4 x i32> zeroinitializer, %a
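
Not part of the patch itself: a minimal standalone C++ sketch of the conditional-negate
identity the new comment block relies on. The helper name negateIfMaskSet is made up for
illustration; it mirrors one 32-bit lane of the (add (xor X, M), (srl M, EltBits-1))
sequence the combine now emits, assuming M is a sign-splat mask such as psrad $31 produces.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Hypothetical helper: conditional negate of one 32-bit lane. m is a
// sign-splat mask, i.e. all ones or all zeros (what vpsrad/psrad $31 yields).
static int32_t negateIfMaskSet(int32_t x, int32_t m) {
  // (m ? -x : x) == ((x ^ m) + (m & 1)), and for a sign-splat mask
  // (m & 1) == ((uint32_t)m >> 31), which is the srl by EltBits-1.
  return (x ^ m) + static_cast<int32_t>(static_cast<uint32_t>(m) >> 31);
}

int main() {
  for (int32_t x : {0, 1, -1, 42, -42, INT32_MIN + 1, INT32_MAX}) {
    assert(negateIfMaskSet(x, 0) == x);   // mask clear: x unchanged
    assert(negateIfMaskSet(x, -1) == -x); // mask set: x negated
  }
  return 0;
}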