Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -26278,6 +26278,58 @@
   return SDValue();
 }
+
+// Match a binop + shuffle pyramid that represents a horizontal reduction over
+// the elements of a vector.
+// Returns the vector that is being reduced on, or SDValue() if a reduction
+// was not matched.
+static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
+  // The pattern must end in an extract from index 0.
+  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
+      !isNullConstant(Extract->getOperand(1)))
+    return SDValue();
+
+  unsigned Stages =
+      Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
+
+  SDValue Op = Extract->getOperand(0);
+  // At each stage, we're looking for something that looks like:
+  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
+  //                    <8 x i32> <i32 4, i32 5, i32 6, i32 7,
+  //                               i32 undef, i32 undef, i32 undef, i32 undef>
+  // %a = binop <8 x i32> %op, %s
+  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
+  // we expect something like:
+  // <4,5,6,7,u,u,u,u>
+  // <2,3,u,u,u,u,u,u>
+  // <1,u,u,u,u,u,u,u>
+  for (unsigned i = 0; i < Stages; ++i) {
+    if (Op.getOpcode() != BinOp)
+      return SDValue();
+
+    ShuffleVectorSDNode *Shuffle =
+        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
+    if (Shuffle) {
+      Op = Op.getOperand(1);
+    } else {
+      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
+      Op = Op.getOperand(0);
+    }
+
+    // The first operand of the shuffle should be the same as the other operand
+    // of the add.
+    if (!Shuffle || (Shuffle->getOperand(0) != Op))
+      return SDValue();
+
+    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
+    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
+      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
+        return SDValue();
+  }
+
+  return Op;
+}
+
 // Given a select, detect the following pattern:
 // 1:    %2 = zext <N x i8> %0 to <N x i32>
 // 2:    %3 = zext <N x i8> %1 to <N x i32>
@@ -26358,12 +26410,81 @@
   return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
 }
 
+static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  // PSADBW is only supported on SSE2 and up.
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  // Verify the type we're extracting from is appropriate.
+  // TODO: There's nothing special about i32, any integer type above i16 should
+  // work just as well.
+  EVT VT = Extract->getOperand(0).getValueType();
+  if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+    return SDValue();
+
+  unsigned RegSize = 128;
+  if (Subtarget.hasBWI())
+    RegSize = 512;
+  else if (Subtarget.hasAVX2())
+    RegSize = 256;
+
+  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // TODO: We should be able to handle larger vectors by splitting them before
+  // feeding them into several SADs, and then reducing over those.
+  if (VT.getSizeInBits() / 4 > RegSize)
+    return SDValue();
+
+  // Match shuffle + add pyramid.
+  SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
+
+  // If there was a match, we want Root to be a select that is the root of an
+  // abs-diff pattern.
+  if (!Root || (Root.getOpcode() != ISD::VSELECT))
+    return SDValue();
+
+  // Check whether we have an abs-diff pattern feeding into the select.
+  SDValue Zext0, Zext1;
+  if (!detectZextAbsDiff(Root, Zext0, Zext1))
+    return SDValue();
+
+  // Create the SAD instruction.
+  SDLoc DL(Extract);
+  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
+
+  // If the original vector was wider than 8 elements, sum over the results
+  // in the SAD vector.
+  unsigned Stages = Log2_32(VT.getVectorNumElements());
+  MVT SadVT = SAD.getSimpleValueType();
+  if (Stages > 3) {
+    unsigned SadElems = SadVT.getVectorNumElements();
+
+    for(unsigned i = Stages - 3; i > 0; --i) {
+      SmallVector<int, 16> Mask(SadElems, -1);
+      for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
+        Mask[j] = MaskEnd + j;
+
+      SDValue Shuffle =
+          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
+      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
+    }
+  }
+
+  // Return the lowest i32.
+  MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+  SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+                     Extract->getOperand(1));
+}
+
 /// Detect vector gather/scatter index generation and convert it from being a
 /// bunch of shuffles and extracts into a somewhat faster sequence.
 /// For i686, the best sequence is apparently storing the value and loading
 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
-                                       TargetLowering::DAGCombinerInfo &DCI) {
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       const X86Subtarget &Subtarget) {
   if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
     return NewOp;
 
@@ -26394,6 +26515,13 @@
     uint64_t Res = (InputValue >> ExtractedElt) & 1;
     return DAG.getConstant(Res, dl, MVT::i1);
   }
+
+  // Check whether this extract is the root of a sum of absolute differences
+  // pattern. This has to be done here because we really want it to happen
+  // pre-legalization.
+  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
+    return SAD;
+
   // Only operate on vectors of 4 elements, where the alternative shuffling
   // gets to be more expensive.
   if (InputVector.getValueType() != MVT::v4i32)
@@ -30730,6 +30858,8 @@
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
 
+  // TODO: There's nothing special about i32, any integer type above i16 should
+  // work just as well.
   if (!VT.isVector() || !VT.isSimple() ||
       !(VT.getVectorElementType() == MVT::i32))
     return SDValue();
@@ -30741,6 +30871,8 @@
     RegSize = 256;
 
   // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // TODO: We should be able to handle larger vectors by splitting them before
+  // feeding them into several SADs, and then reducing over those.
   if (VT.getSizeInBits() / 4 > RegSize)
     return SDValue();
 
@@ -30978,7 +31110,8 @@
   SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
   default: break;
-  case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
   case ISD::VSELECT:
   case ISD::SELECT:
   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
Index: llvm/trunk/test/CodeGen/X86/sad.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sad.ll
+++ llvm/trunk/test/CodeGen/X86/sad.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
@@ -999,3 +998,311 @@
   ret i32 %12
 }
 
+define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
+; SSE2-LABEL: sad_nonloop_4i8:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad_nonloop_4i8:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad_nonloop_4i8:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: sad_nonloop_4i8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512BW-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512BW-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    retq
+  %v1 = load <4 x i8>, <4 x i8>* %p, align 1
+  %z1 = zext <4 x i8> %v1 to <4 x i32>
+  %v2 = load <4 x i8>, <4 x i8>* %q, align 1
+  %z2 = zext <4 x i8> %v2 to <4 x i32>
+  %sub = sub nsw <4 x i32> %z1, %z2
+  %isneg = icmp sgt <4 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %neg = sub nsw <4 x i32> zeroinitializer, %sub
+  %abs = select <4 x i1> %isneg, <4 x i32> %sub, <4 x i32> %neg
+  %h2 = shufflevector <4 x i32> %abs, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %sum2 = add <4 x i32> %abs, %h2
+  %h3 = shufflevector <4 x i32> %sum2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %sum3 = add <4 x i32> %sum2, %h3
+  %sum = extractelement <4 x i32> %sum3, i32 0
+  ret i32 %sum
+}
+
+define i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
+; SSE2-LABEL: sad_nonloop_8i8:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad_nonloop_8i8:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad_nonloop_8i8:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: sad_nonloop_8i8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    retq
+  %v1 = load <8 x i8>, <8 x i8>* %p, align 1
+  %z1 = zext <8 x i8> %v1 to <8 x i32>
+  %v2 = load <8 x i8>, <8 x i8>* %q, align 1
+  %z2 = zext <8 x i8> %v2 to <8 x i32>
+  %sub = sub nsw <8 x i32> %z1, %z2
+  %isneg = icmp sgt <8 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %neg = sub nsw <8 x i32> zeroinitializer, %sub
+  %abs = select <8 x i1> %isneg, <8 x i32> %sub, <8 x i32> %neg
+  %h1 = shufflevector <8 x i32> %abs, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum1 = add <8 x i32> %abs, %h1
+  %h2 = shufflevector <8 x i32> %sum1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum2 = add <8 x i32> %sum1, %h2
+  %h3 = shufflevector <8 x i32> %sum2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum3 = add <8 x i32> %sum2, %h3
+  %sum = extractelement <8 x i32> %sum3, i32 0
+  ret i32 %sum
+}
+
+define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
+; SSE2-LABEL: sad_nonloop_16i8:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    movdqu (%rdx), %xmm1
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad_nonloop_16i8:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX2-NEXT:    vpsadbw (%rdx), %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad_nonloop_16i8:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512F-NEXT:    vpsadbw (%rdx), %xmm0, %xmm0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: sad_nonloop_16i8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512BW-NEXT:    vpsadbw (%rdx), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    retq
+  %v1 = load <16 x i8>, <16 x i8>* %p, align 1
+  %z1 = zext <16 x i8> %v1 to <16 x i32>
+  %v2 = load <16 x i8>, <16 x i8>* %q, align 1
+  %z2 = zext <16 x i8> %v2 to <16 x i32>
+  %sub = sub nsw <16 x i32> %z1, %z2
+  %isneg = icmp sgt <16 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %neg = sub nsw <16 x i32> zeroinitializer, %sub
+  %abs = select <16 x i1> %isneg, <16 x i32> %sub, <16 x i32> %neg
+  %h0 = shufflevector <16 x i32> %abs, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum0 = add <16 x i32> %abs, %h0
+  %h1 = shufflevector <16 x i32> %sum0, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum1 = add <16 x i32> %sum0, %h1
+  %h2 = shufflevector <16 x i32> %sum1, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum2 = add <16 x i32> %sum1, %h2
+  %h3 = shufflevector <16 x i32> %sum2, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum3 = add <16 x i32> %sum2, %h3
+  %sum = extractelement <16 x i32> %sum3, i32 0
+  ret i32 %sum
+}
+
+define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
+; SSE2-LABEL: sad_nonloop_32i8:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqu (%rdi), %xmm12
+; SSE2-NEXT:    movdqu 16(%rdi), %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm2[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm13, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[2,3,0,1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm12, %xmm11
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
+; SSE2-NEXT:    movdqu (%rdx), %xmm7
+; SSE2-NEXT:    movdqu 16(%rdx), %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[2,3,0,1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm4, %xmm14
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm15
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm7, %xmm8
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; SSE2-NEXT:    psubd %xmm7, %xmm12
+; SSE2-NEXT:    psubd %xmm0, %xmm2
+; SSE2-NEXT:    psubd %xmm4, %xmm1
+; SSE2-NEXT:    psubd %xmm6, %xmm13
+; SSE2-NEXT:    psubd %xmm8, %xmm11
+; SSE2-NEXT:    psubd %xmm15, %xmm10
+; SSE2-NEXT:    psubd %xmm14, %xmm3
+; SSE2-NEXT:    psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa %xmm9, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm9
+; SSE2-NEXT:    pxor %xmm0, %xmm9
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm10, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm10
+; SSE2-NEXT:    pxor %xmm0, %xmm10
+; SSE2-NEXT:    movdqa %xmm11, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm11
+; SSE2-NEXT:    pxor %xmm0, %xmm11
+; SSE2-NEXT:    movdqa %xmm13, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm13
+; SSE2-NEXT:    pxor %xmm0, %xmm13
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm12, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm12
+; SSE2-NEXT:    pxor %xmm0, %xmm12
+; SSE2-NEXT:    paddd %xmm13, %xmm1
+; SSE2-NEXT:    paddd %xmm9, %xmm3
+; SSE2-NEXT:    paddd %xmm10, %xmm3
+; SSE2-NEXT:    paddd %xmm11, %xmm3
+; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    paddd %xmm12, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad_nonloop_32i8:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpsadbw (%rdx), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad_nonloop_32i8:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512F-NEXT:    vpsadbw (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: sad_nonloop_32i8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512BW-NEXT:    vpsadbw (%rdx), %ymm0, %ymm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    retq
+  %v1 = load <32 x i8>, <32 x i8>* %p, align 1
+  %z1 = zext <32 x i8> %v1 to <32 x i32>
+  %v2 = load <32 x i8>, <32 x i8>* %q, align 1
+  %z2 = zext <32 x i8> %v2 to <32 x i32>
+  %sub = sub nsw <32 x i32> %z1, %z2
+  %isneg = icmp sgt <32 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %neg = sub nsw <32 x i32> zeroinitializer, %sub
+  %abs = select <32 x i1> %isneg, <32 x i32> %sub, <32 x i32> %neg
+  %h32 = shufflevector <32 x i32> %abs, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum32 = add <32 x i32> %abs, %h32
+  %h0 = shufflevector <32 x i32> %sum32, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum0 = add <32 x i32> %sum32, %h0
+  %h1 = shufflevector <32 x i32> %sum0, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum1 = add <32 x i32> %sum0, %h1
+  %h2 = shufflevector <32 x i32> %sum1, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum2 = add <32 x i32> %sum1, %h2
+  %h3 = shufflevector <32 x i32> %sum2, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %sum3 = add <32 x i32> %sum2, %h3
+  %sum = extractelement <32 x i32> %sum3, i32 0
+  ret i32 %sum
+}
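
For readers who want a model of the shape the new combine recognizes, the sketch below is a plain, standalone C++ stand-in (not part of the patch; the function name horizontalAddPyramid and the use of std::array are invented for illustration) for the log2(N)-stage shufflevector+add pyramid that matchBinOpReduction walks. At step i of the walk the expected shuffle mask is <MaskEnd+0, MaskEnd+1, ..., undef, ...> with MaskEnd = 1 << i, so after all stages lane 0 holds the horizontal sum and the final extractelement at index 0 can be rewritten in terms of PSADBW.

// Illustrative sketch only: a scalar model of the shuffle+add reduction
// pyramid, not LLVM code. Assumes N is a power of two, as in the tests.
#include <array>
#include <cstddef>
#include <cstdio>

template <std::size_t N>
int horizontalAddPyramid(std::array<int, N> Vec) {
  // log2(N) stages: each stage adds the upper half of the live lanes onto the
  // lower half, mirroring the shufflevector masks <N/2, N/2+1, ..., u, u>,
  // <N/4, ..., u, u>, ..., <1, u, ..., u> used in the sad_nonloop_* tests.
  for (std::size_t Half = N / 2; Half >= 1; Half /= 2)
    for (std::size_t Lane = 0; Lane < Half; ++Lane)
      Vec[Lane] += Vec[Lane + Half]; // shuffle by Half lanes, then add
  return Vec[0];                     // the final extract from index 0
}

int main() {
  // Pretend these are the zero-extended absolute differences of 8 byte pairs.
  std::array<int, 8> AbsDiffs = {3, 1, 4, 1, 5, 9, 2, 6};
  std::printf("%d\n", horizontalAddPyramid(AbsDiffs)); // prints 31
  return 0;
}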