Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -29036,12 +29036,18 @@
   if (SetCC.getOpcode() != ISD::SETCC)
     return false;
   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
-  if (CC != ISD::SETGT)
+  if (CC != ISD::SETGT && CC != ISD::SETLT)
     return false;
 
   SDValue SelectOp1 = Select->getOperand(1);
   SDValue SelectOp2 = Select->getOperand(2);
 
+  // The following instructions assume SelectOp1 is the subtraction operand
+  // and SelectOp2 is the negation operand.
+  // In the case of SETLT this is the other way around.
+  if (CC == ISD::SETLT)
+    std::swap(SelectOp1, SelectOp2);
+
   // The second operand of the select should be the negation of the first
   // operand, which is implemented as 0 - SelectOp1.
   if (!(SelectOp2.getOpcode() == ISD::SUB &&
@@ -29054,8 +29060,17 @@
   if (SetCC.getOperand(0) != SelectOp1)
     return false;
 
-  // The second operand of the comparison can be either -1 or 0.
-  if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+  // In the SETLT case, the second operand of the comparison can be either 1 or 0.
+  APInt SplatVal;
+  if ((CC == ISD::SETLT) &&
+      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
+         SplatVal == 1) ||
+        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+    return false;
+
+  // In the SETGT case, the second operand of the comparison can be either -1 or 0.
+  if ((CC == ISD::SETGT) &&
+      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
       ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
     return false;
 
@@ -29184,11 +29199,9 @@
   if (!Subtarget.hasSSE2())
     return SDValue();
 
-  // Verify the type we're extracting from is appropriate
-  // TODO: There's nothing special about i32, any integer type above i16 should
-  // work just as well.
+  // Verify the type we're extracting from is an integer type wider than i16.
   EVT VT = Extract->getOperand(0).getValueType();
-  if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+  if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
     return SDValue();
 
   unsigned RegSize = 128;
@@ -29197,15 +29210,22 @@
   else if (Subtarget.hasAVX2())
     RegSize = 256;
 
-  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
   // TODO: We should be able to handle larger vectors by splitting them before
   // feeding them into several SADs, and then reducing over those.
-  if (VT.getSizeInBits() / 4 > RegSize)
+  if (RegSize / VT.getVectorNumElements() < 8)
    return SDValue();
 
   // Match shuffle + add pyramid.
   SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
 
+  // The operand is expected to be zero extended from i8
+  // (verified in detectZextAbsDiff). As a result, the sign extend is
+  // basically a zero extend (it extends the sign bit, which is zero).
+  // So it is correct to skip the sign extend instruction.
+  if (Root && Root.getOpcode() == ISD::SIGN_EXTEND)
+    Root = Root.getOperand(0);
+
   // If there was a match, we want Root to be a select that is the root of an
   // abs-diff pattern.
   if (!Root || (Root.getOpcode() != ISD::VSELECT))
@@ -29216,7 +29236,7 @@
   if (!detectZextAbsDiff(Root, Zext0, Zext1))
     return SDValue();
 
-  // Create the SAD instruction
+  // Create the SAD instruction.
   SDLoc DL(Extract);
   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
 
@@ -29238,10 +29258,12 @@
     }
   }
 
-  // Return the lowest i32.
-  MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+  MVT Type = Extract->getSimpleValueType(0);
+  unsigned TypeSizeInBits = Type.getSizeInBits();
+  // Return the lowest TypeSizeInBits bits.
+  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
   SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
                      Extract->getOperand(1));
 }
 
Index: test/CodeGen/X86/sad_variations.ll
===================================================================
--- test/CodeGen/X86/sad_variations.ll
+++ test/CodeGen/X86/sad_variations.ll
@@ -0,0 +1,1337 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+
+define i32 @sad8_64bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
+; SSE2-LABEL: sad8_64bit_icmp_sge:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movl %edx, %eax
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rsi
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rdi
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    addl %ecx, %edx
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rsi
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rdi
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    addl %edx, %ecx
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rsi
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rdi
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    addl %ecx, %edx
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rsi
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rdi
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    addl %edx, %ecx
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rsi
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rdi
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    addl %ecx, %edx
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rsi
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    addq %rax, %rdi
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    addl %edx, %ecx
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    addl %ecx, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_64bit_icmp_sge:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %ecx
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    addq %rax, %rsi
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+;
AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sad8_64bit_icmp_sge: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: movl %edx, %eax +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 
+; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: retq + +entry: + %idx.ext = zext i32 %stride to i64 + br label %for.body + +for.body: ; preds = %entry + %0 = bitcast i8* %cur to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = zext <8 x i8> %1 to <8 x i32> + %3 = bitcast i8* %ref to <8 x i8>* + %4 = load <8 x i8>, <8 x i8>* %3, align 1 + %5 = zext <8 x i8> %4 to <8 x i32> + %6 = sub nsw <8 x i32> %2, %5 + %7 = icmp sgt <8 x i32> %6, + %8 = sub nsw <8 x i32> zeroinitializer, %6 + %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8 + %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %9, %rdx.shuf + %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229 + %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> + %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231 + %10 = extractelement <8 x i32> %bin.rdx232, i32 0 + %add.ptr = getelementptr inbounds i8, i8* %cur, i64 %idx.ext + %add.ptr178 = getelementptr inbounds i8, i8* %ref, i64 %idx.ext + %11 = bitcast i8* %add.ptr to <8 x i8>* + %12 = load <8 x i8>, <8 x i8>* %11, align 1 + %13 = zext <8 x i8> %12 to <8 x i32> + %14 = bitcast i8* %add.ptr178 to <8 x i8>* + %15 = load <8 x i8>, <8 x i8>* %14, align 1 + %16 = zext <8 x i8> %15 to <8 x i32> + %17 = sub nsw <8 x i32> %13, %16 + %18 = icmp sgt <8 x i32> %17, + %19 = sub nsw <8 x i32> zeroinitializer, %17 + %20 = select <8 x i1> %18, <8 x i32> %17, <8 x i32> %19 + %rdx.shuf.1 = shufflevector <8 x i32> %20, <8 x i32> undef, <8 x i32> + %bin.rdx.1 = add <8 x i32> %20, %rdx.shuf.1 + %rdx.shuf229.1 = shufflevector <8 x i32> %bin.rdx.1, <8 x i32> undef, <8 x i32> + %bin.rdx230.1 = add <8 x i32> %bin.rdx.1, %rdx.shuf229.1 + %rdx.shuf231.1 = shufflevector <8 x i32> %bin.rdx230.1, <8 x i32> undef, <8 x i32> + %bin.rdx232.1 = add <8 x i32> %bin.rdx230.1, %rdx.shuf231.1 + %21 = extractelement <8 x i32> %bin.rdx232.1, i32 0 + %bin.extra.1 = add i32 %21, %10 + %add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext + %add.ptr178.1 = getelementptr inbounds i8, i8* %add.ptr178, i64 %idx.ext + %22 = bitcast i8* %add.ptr.1 to <8 x i8>* + %23 = load <8 x i8>, <8 x i8>* %22 + %24 = zext <8 x i8> %23 to <8 x i32> + %25 = bitcast i8* %add.ptr178.1 to <8 x i8>* + %26 = load <8 x i8>, <8 x i8>* %25 + %27 = zext <8 x i8> %26 to <8 x i32> + %28 = sub nsw <8 x i32> %24, %27 + %29 = icmp sgt <8 x i32> %28, + %30 = sub nsw <8 x i32> zeroinitializer, %28 + %31 = select <8 x i1> %29, <8 x i32> %28, <8 x i32> %30 + %rdx.shuf.2 = shufflevector <8 x i32> %31, <8 x i32> undef, <8 x i32> + %bin.rdx.2 = add <8 x i32> %31, %rdx.shuf.2 + %rdx.shuf229.2 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> + %bin.rdx230.2 = add <8 x i32> %bin.rdx.2, %rdx.shuf229.2 + %rdx.shuf231.2 = shufflevector <8 x i32> %bin.rdx230.2, <8 x i32> undef, <8 x i32> + %bin.rdx232.2 = add <8 x i32> %bin.rdx230.2, %rdx.shuf231.2 + %32 = extractelement <8 x i32> %bin.rdx232.2, i32 0 + %bin.extra.2 = add i32 %32, %bin.extra.1 + %add.ptr.2 = getelementptr inbounds i8, i8* %add.ptr.1, i64 %idx.ext + %add.ptr178.2 = getelementptr inbounds i8, i8* %add.ptr178.1, i64 %idx.ext + %33 = bitcast i8* %add.ptr.2 to <8 x i8>* + %34 = load <8 x 
i8>, <8 x i8>* %33, align 1 + %35 = zext <8 x i8> %34 to <8 x i32> + %36 = bitcast i8* %add.ptr178.2 to <8 x i8>* + %37 = load <8 x i8>, <8 x i8>* %36, align 1 + %38 = zext <8 x i8> %37 to <8 x i32> + %39 = sub nsw <8 x i32> %35, %38 + %40 = icmp sgt <8 x i32> %39, + %41 = sub nsw <8 x i32> zeroinitializer, %39 + %42 = select <8 x i1> %40, <8 x i32> %39, <8 x i32> %41 + %rdx.shuf.3 = shufflevector <8 x i32> %42, <8 x i32> undef, <8 x i32> + %bin.rdx.3 = add <8 x i32> %42, %rdx.shuf.3 + %rdx.shuf229.3 = shufflevector <8 x i32> %bin.rdx.3, <8 x i32> undef, <8 x i32> + %bin.rdx230.3 = add <8 x i32> %bin.rdx.3, %rdx.shuf229.3 + %rdx.shuf231.3 = shufflevector <8 x i32> %bin.rdx230.3, <8 x i32> undef, <8 x i32> + %bin.rdx232.3 = add <8 x i32> %bin.rdx230.3, %rdx.shuf231.3 + %43 = extractelement <8 x i32> %bin.rdx232.3, i32 0 + %bin.extra.3 = add i32 %43, %bin.extra.2 + %add.ptr.3 = getelementptr inbounds i8, i8* %add.ptr.2, i64 %idx.ext + %add.ptr178.3 = getelementptr inbounds i8, i8* %add.ptr178.2, i64 %idx.ext + %44 = bitcast i8* %add.ptr.3 to <8 x i8>* + %45 = load <8 x i8>, <8 x i8>* %44, align 1 + %46 = zext <8 x i8> %45 to <8 x i32> + %47 = bitcast i8* %add.ptr178.3 to <8 x i8>* + %48 = load <8 x i8>, <8 x i8>* %47, align 1 + %49 = zext <8 x i8> %48 to <8 x i32> + %50 = sub nsw <8 x i32> %46, %49 + %51 = icmp sgt <8 x i32> %50, + %52 = sub nsw <8 x i32> zeroinitializer, %50 + %53 = select <8 x i1> %51, <8 x i32> %50, <8 x i32> %52 + %rdx.shuf.4 = shufflevector <8 x i32> %53, <8 x i32> undef, <8 x i32> + %bin.rdx.4 = add <8 x i32> %53, %rdx.shuf.4 + %rdx.shuf229.4 = shufflevector <8 x i32> %bin.rdx.4, <8 x i32> undef, <8 x i32> + %bin.rdx230.4 = add <8 x i32> %bin.rdx.4, %rdx.shuf229.4 + %rdx.shuf231.4 = shufflevector <8 x i32> %bin.rdx230.4, <8 x i32> undef, <8 x i32> + %bin.rdx232.4 = add <8 x i32> %bin.rdx230.4, %rdx.shuf231.4 + %54 = extractelement <8 x i32> %bin.rdx232.4, i32 0 + %bin.extra.4 = add i32 %54, %bin.extra.3 + %add.ptr.4 = getelementptr inbounds i8, i8* %add.ptr.3, i64 %idx.ext + %add.ptr178.4 = getelementptr inbounds i8, i8* %add.ptr178.3, i64 %idx.ext + %55 = bitcast i8* %add.ptr.4 to <8 x i8>* + %56 = load <8 x i8>, <8 x i8>* %55, align 1 + %57 = zext <8 x i8> %56 to <8 x i32> + %58 = bitcast i8* %add.ptr178.4 to <8 x i8>* + %59 = load <8 x i8>, <8 x i8>* %58, align 1 + %60 = zext <8 x i8> %59 to <8 x i32> + %61 = sub nsw <8 x i32> %57, %60 + %62 = icmp sgt <8 x i32> %61, + %63 = sub nsw <8 x i32> zeroinitializer, %61 + %64 = select <8 x i1> %62, <8 x i32> %61, <8 x i32> %63 + %rdx.shuf.5 = shufflevector <8 x i32> %64, <8 x i32> undef, <8 x i32> + %bin.rdx.5 = add <8 x i32> %64, %rdx.shuf.5 + %rdx.shuf229.5 = shufflevector <8 x i32> %bin.rdx.5, <8 x i32> undef, <8 x i32> + %bin.rdx230.5 = add <8 x i32> %bin.rdx.5, %rdx.shuf229.5 + %rdx.shuf231.5 = shufflevector <8 x i32> %bin.rdx230.5, <8 x i32> undef, <8 x i32> + %bin.rdx232.5 = add <8 x i32> %bin.rdx230.5, %rdx.shuf231.5 + %65 = extractelement <8 x i32> %bin.rdx232.5, i32 0 + %bin.extra.5 = add i32 %65, %bin.extra.4 + %add.ptr.5 = getelementptr inbounds i8, i8* %add.ptr.4, i64 %idx.ext + %add.ptr178.5 = getelementptr inbounds i8, i8* %add.ptr178.4, i64 %idx.ext + %66 = bitcast i8* %add.ptr.5 to <8 x i8>* + %67 = load <8 x i8>, <8 x i8>* %66, align 1 + %68 = zext <8 x i8> %67 to <8 x i32> + %69 = bitcast i8* %add.ptr178.5 to <8 x i8>* + %70 = load <8 x i8>, <8 x i8>* %69, align 1 + %71 = zext <8 x i8> %70 to <8 x i32> + %72 = sub nsw <8 x i32> %68, %71 + %73 = icmp sgt <8 x i32> %72, + %74 = sub nsw <8 x i32> 
zeroinitializer, %72 + %75 = select <8 x i1> %73, <8 x i32> %72, <8 x i32> %74 + %rdx.shuf.6 = shufflevector <8 x i32> %75, <8 x i32> undef, <8 x i32> + %bin.rdx.6 = add <8 x i32> %75, %rdx.shuf.6 + %rdx.shuf229.6 = shufflevector <8 x i32> %bin.rdx.6, <8 x i32> undef, <8 x i32> + %bin.rdx230.6 = add <8 x i32> %bin.rdx.6, %rdx.shuf229.6 + %rdx.shuf231.6 = shufflevector <8 x i32> %bin.rdx230.6, <8 x i32> undef, <8 x i32> + %bin.rdx232.6 = add <8 x i32> %bin.rdx230.6, %rdx.shuf231.6 + %76 = extractelement <8 x i32> %bin.rdx232.6, i32 0 + %bin.extra.6 = add i32 %76, %bin.extra.5 + %add.ptr.6 = getelementptr inbounds i8, i8* %add.ptr.5, i64 %idx.ext + %add.ptr178.6 = getelementptr inbounds i8, i8* %add.ptr178.5, i64 %idx.ext + %77 = bitcast i8* %add.ptr.6 to <8 x i8>* + %78 = load <8 x i8>, <8 x i8>* %77, align 1 + %79 = zext <8 x i8> %78 to <8 x i32> + %80 = bitcast i8* %add.ptr178.6 to <8 x i8>* + %81 = load <8 x i8>, <8 x i8>* %80, align 1 + %82 = zext <8 x i8> %81 to <8 x i32> + %83 = sub nsw <8 x i32> %79, %82 + %84 = icmp sgt <8 x i32> %83, + %85 = sub nsw <8 x i32> zeroinitializer, %83 + %86 = select <8 x i1> %84, <8 x i32> %83, <8 x i32> %85 + %rdx.shuf.7 = shufflevector <8 x i32> %86, <8 x i32> undef, <8 x i32> + %bin.rdx.7 = add <8 x i32> %86, %rdx.shuf.7 + %rdx.shuf229.7 = shufflevector <8 x i32> %bin.rdx.7, <8 x i32> undef, <8 x i32> + %bin.rdx230.7 = add <8 x i32> %bin.rdx.7, %rdx.shuf229.7 + %rdx.shuf231.7 = shufflevector <8 x i32> %bin.rdx230.7, <8 x i32> undef, <8 x i32> + %bin.rdx232.7 = add <8 x i32> %bin.rdx230.7, %rdx.shuf231.7 + %87 = extractelement <8 x i32> %bin.rdx232.7, i32 0 + %bin.extra.7 = add i32 %87, %bin.extra.6 + ret i32 %bin.extra.7 +} + +define i32 @sad8_64bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 { +; SSE2-LABEL: sad8_64bit_icmp_sgt: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: addl %edx, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: addl %edx, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: 
addl %edx, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: addl %ecx, %eax +; SSE2-NEXT: retq +; +; AVX2-LABEL: sad8_64bit_icmp_sgt: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sad8_64bit_icmp_sgt: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: movl %edx, %eax +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq 
%rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: retq +entry: + %idx.ext = zext i32 %stride to i64 + br label %for.body + +for.body: ; preds = %entry + %0 = bitcast i8* %cur to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = zext <8 x i8> %1 to <8 x i32> + %3 = bitcast i8* %ref to <8 x i8>* + %4 = load <8 x i8>, <8 x i8>* %3, align 1 + %5 = zext <8 x i8> %4 to <8 x i32> + %6 = sub nsw <8 x i32> %2, %5 + %7 = icmp sgt <8 x i32> %6, zeroinitializer + %8 = sub nsw <8 x i32> zeroinitializer, %6 + %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8 + %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %9, %rdx.shuf + %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229 + %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> + %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231 + %10 = extractelement <8 x i32> %bin.rdx232, i32 0 + %add.ptr = getelementptr inbounds i8, i8* %cur, i64 %idx.ext + %add.ptr178 = getelementptr inbounds i8, i8* %ref, i64 %idx.ext + %11 = bitcast i8* %add.ptr to <8 x i8>* + %12 = load <8 x i8>, <8 x i8>* %11, align 1 + %13 = zext <8 x i8> %12 to <8 x i32> + %14 = bitcast i8* %add.ptr178 to <8 x i8>* + %15 = load <8 x i8>, <8 x i8>* %14, align 1 + %16 = zext <8 x i8> %15 to <8 x i32> + %17 = sub nsw <8 x i32> %13, %16 + %18 = icmp sgt <8 x i32> %17, zeroinitializer + %19 = sub nsw <8 x i32> zeroinitializer, %17 + %20 = select <8 x i1> %18, <8 x i32> %17, <8 x i32> %19 + %rdx.shuf.1 = shufflevector <8 x i32> %20, <8 x i32> undef, <8 x i32> + %bin.rdx.1 = add <8 x i32> %20, %rdx.shuf.1 + %rdx.shuf229.1 = shufflevector <8 x i32> %bin.rdx.1, <8 x i32> undef, <8 x i32> + %bin.rdx230.1 = add <8 x i32> %bin.rdx.1, %rdx.shuf229.1 + %rdx.shuf231.1 = shufflevector <8 x i32> %bin.rdx230.1, <8 x i32> undef, <8 x i32> + %bin.rdx232.1 = add <8 x i32> %bin.rdx230.1, %rdx.shuf231.1 + %21 = extractelement <8 x i32> %bin.rdx232.1, i32 0 + %bin.extra.1 = add i32 %21, %10 + %add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext + %add.ptr178.1 = getelementptr inbounds i8, i8* %add.ptr178, i64 %idx.ext + %22 = bitcast i8* %add.ptr.1 to <8 x i8>* + %23 = load <8 x i8>, <8 x i8>* %22, align 1 + %24 = zext <8 x i8> %23 to <8 x i32> + %25 = bitcast i8* %add.ptr178.1 to <8 x i8>* + %26 = load <8 x i8>, <8 x i8>* %25, align 1 + %27 = zext <8 x i8> %26 to <8 x i32> + %28 = sub nsw <8 x i32> %24, %27 + %29 = icmp sgt <8 x i32> %28, zeroinitializer + %30 = sub nsw <8 x i32> zeroinitializer, %28 + %31 = select <8 x i1> %29, <8 x i32> %28, <8 x i32> %30 + %rdx.shuf.2 = shufflevector 
<8 x i32> %31, <8 x i32> undef, <8 x i32> + %bin.rdx.2 = add <8 x i32> %31, %rdx.shuf.2 + %rdx.shuf229.2 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> + %bin.rdx230.2 = add <8 x i32> %bin.rdx.2, %rdx.shuf229.2 + %rdx.shuf231.2 = shufflevector <8 x i32> %bin.rdx230.2, <8 x i32> undef, <8 x i32> + %bin.rdx232.2 = add <8 x i32> %bin.rdx230.2, %rdx.shuf231.2 + %32 = extractelement <8 x i32> %bin.rdx232.2, i32 0 + %bin.extra.2 = add i32 %32, %bin.extra.1 + %add.ptr.2 = getelementptr inbounds i8, i8* %add.ptr.1, i64 %idx.ext + %add.ptr178.2 = getelementptr inbounds i8, i8* %add.ptr178.1, i64 %idx.ext + %33 = bitcast i8* %add.ptr.2 to <8 x i8>* + %34 = load <8 x i8>, <8 x i8>* %33, align 1 + %35 = zext <8 x i8> %34 to <8 x i32> + %36 = bitcast i8* %add.ptr178.2 to <8 x i8>* + %37 = load <8 x i8>, <8 x i8>* %36, align 1 + %38 = zext <8 x i8> %37 to <8 x i32> + %39 = sub nsw <8 x i32> %35, %38 + %40 = icmp sgt <8 x i32> %39, zeroinitializer + %41 = sub nsw <8 x i32> zeroinitializer, %39 + %42 = select <8 x i1> %40, <8 x i32> %39, <8 x i32> %41 + %rdx.shuf.3 = shufflevector <8 x i32> %42, <8 x i32> undef, <8 x i32> + %bin.rdx.3 = add <8 x i32> %42, %rdx.shuf.3 + %rdx.shuf229.3 = shufflevector <8 x i32> %bin.rdx.3, <8 x i32> undef, <8 x i32> + %bin.rdx230.3 = add <8 x i32> %bin.rdx.3, %rdx.shuf229.3 + %rdx.shuf231.3 = shufflevector <8 x i32> %bin.rdx230.3, <8 x i32> undef, <8 x i32> + %bin.rdx232.3 = add <8 x i32> %bin.rdx230.3, %rdx.shuf231.3 + %43 = extractelement <8 x i32> %bin.rdx232.3, i32 0 + %bin.extra.3 = add i32 %43, %bin.extra.2 + %add.ptr.3 = getelementptr inbounds i8, i8* %add.ptr.2, i64 %idx.ext + %add.ptr178.3 = getelementptr inbounds i8, i8* %add.ptr178.2, i64 %idx.ext + %44 = bitcast i8* %add.ptr.3 to <8 x i8>* + %45 = load <8 x i8>, <8 x i8>* %44, align 1 + %46 = zext <8 x i8> %45 to <8 x i32> + %47 = bitcast i8* %add.ptr178.3 to <8 x i8>* + %48 = load <8 x i8>, <8 x i8>* %47, align 1 + %49 = zext <8 x i8> %48 to <8 x i32> + %50 = sub nsw <8 x i32> %46, %49 + %51 = icmp sgt <8 x i32> %50, zeroinitializer + %52 = sub nsw <8 x i32> zeroinitializer, %50 + %53 = select <8 x i1> %51, <8 x i32> %50, <8 x i32> %52 + %rdx.shuf.4 = shufflevector <8 x i32> %53, <8 x i32> undef, <8 x i32> + %bin.rdx.4 = add <8 x i32> %53, %rdx.shuf.4 + %rdx.shuf229.4 = shufflevector <8 x i32> %bin.rdx.4, <8 x i32> undef, <8 x i32> + %bin.rdx230.4 = add <8 x i32> %bin.rdx.4, %rdx.shuf229.4 + %rdx.shuf231.4 = shufflevector <8 x i32> %bin.rdx230.4, <8 x i32> undef, <8 x i32> + %bin.rdx232.4 = add <8 x i32> %bin.rdx230.4, %rdx.shuf231.4 + %54 = extractelement <8 x i32> %bin.rdx232.4, i32 0 + %bin.extra.4 = add i32 %54, %bin.extra.3 + %add.ptr.4 = getelementptr inbounds i8, i8* %add.ptr.3, i64 %idx.ext + %add.ptr178.4 = getelementptr inbounds i8, i8* %add.ptr178.3, i64 %idx.ext + %55 = bitcast i8* %add.ptr.4 to <8 x i8>* + %56 = load <8 x i8>, <8 x i8>* %55, align 1 + %57 = zext <8 x i8> %56 to <8 x i32> + %58 = bitcast i8* %add.ptr178.4 to <8 x i8>* + %59 = load <8 x i8>, <8 x i8>* %58, align 1 + %60 = zext <8 x i8> %59 to <8 x i32> + %61 = sub nsw <8 x i32> %57, %60 + %62 = icmp sgt <8 x i32> %61, zeroinitializer + %63 = sub nsw <8 x i32> zeroinitializer, %61 + %64 = select <8 x i1> %62, <8 x i32> %61, <8 x i32> %63 + %rdx.shuf.5 = shufflevector <8 x i32> %64, <8 x i32> undef, <8 x i32> + %bin.rdx.5 = add <8 x i32> %64, %rdx.shuf.5 + %rdx.shuf229.5 = shufflevector <8 x i32> %bin.rdx.5, <8 x i32> undef, <8 x i32> + %bin.rdx230.5 = add <8 x i32> %bin.rdx.5, %rdx.shuf229.5 + %rdx.shuf231.5 = 
shufflevector <8 x i32> %bin.rdx230.5, <8 x i32> undef, <8 x i32> + %bin.rdx232.5 = add <8 x i32> %bin.rdx230.5, %rdx.shuf231.5 + %65 = extractelement <8 x i32> %bin.rdx232.5, i32 0 + %bin.extra.5 = add i32 %65, %bin.extra.4 + %add.ptr.5 = getelementptr inbounds i8, i8* %add.ptr.4, i64 %idx.ext + %add.ptr178.5 = getelementptr inbounds i8, i8* %add.ptr178.4, i64 %idx.ext + %66 = bitcast i8* %add.ptr.5 to <8 x i8>* + %67 = load <8 x i8>, <8 x i8>* %66, align 1 + %68 = zext <8 x i8> %67 to <8 x i32> + %69 = bitcast i8* %add.ptr178.5 to <8 x i8>* + %70 = load <8 x i8>, <8 x i8>* %69, align 1 + %71 = zext <8 x i8> %70 to <8 x i32> + %72 = sub nsw <8 x i32> %68, %71 + %73 = icmp sgt <8 x i32> %72, zeroinitializer + %74 = sub nsw <8 x i32> zeroinitializer, %72 + %75 = select <8 x i1> %73, <8 x i32> %72, <8 x i32> %74 + %rdx.shuf.6 = shufflevector <8 x i32> %75, <8 x i32> undef, <8 x i32> + %bin.rdx.6 = add <8 x i32> %75, %rdx.shuf.6 + %rdx.shuf229.6 = shufflevector <8 x i32> %bin.rdx.6, <8 x i32> undef, <8 x i32> + %bin.rdx230.6 = add <8 x i32> %bin.rdx.6, %rdx.shuf229.6 + %rdx.shuf231.6 = shufflevector <8 x i32> %bin.rdx230.6, <8 x i32> undef, <8 x i32> + %bin.rdx232.6 = add <8 x i32> %bin.rdx230.6, %rdx.shuf231.6 + %76 = extractelement <8 x i32> %bin.rdx232.6, i32 0 + %bin.extra.6 = add i32 %76, %bin.extra.5 + %add.ptr.6 = getelementptr inbounds i8, i8* %add.ptr.5, i64 %idx.ext + %add.ptr178.6 = getelementptr inbounds i8, i8* %add.ptr178.5, i64 %idx.ext + %77 = bitcast i8* %add.ptr.6 to <8 x i8>* + %78 = load <8 x i8>, <8 x i8>* %77, align 1 + %79 = zext <8 x i8> %78 to <8 x i32> + %80 = bitcast i8* %add.ptr178.6 to <8 x i8>* + %81 = load <8 x i8>, <8 x i8>* %80, align 1 + %82 = zext <8 x i8> %81 to <8 x i32> + %83 = sub nsw <8 x i32> %79, %82 + %84 = icmp sgt <8 x i32> %83, zeroinitializer + %85 = sub nsw <8 x i32> zeroinitializer, %83 + %86 = select <8 x i1> %84, <8 x i32> %83, <8 x i32> %85 + %rdx.shuf.7 = shufflevector <8 x i32> %86, <8 x i32> undef, <8 x i32> + %bin.rdx.7 = add <8 x i32> %86, %rdx.shuf.7 + %rdx.shuf229.7 = shufflevector <8 x i32> %bin.rdx.7, <8 x i32> undef, <8 x i32> + %bin.rdx230.7 = add <8 x i32> %bin.rdx.7, %rdx.shuf229.7 + %rdx.shuf231.7 = shufflevector <8 x i32> %bin.rdx230.7, <8 x i32> undef, <8 x i32> + %bin.rdx232.7 = add <8 x i32> %bin.rdx230.7, %rdx.shuf231.7 + %87 = extractelement <8 x i32> %bin.rdx232.7, i32 0 + %bin.extra.7 = add i32 %87, %bin.extra.6 + ret i32 %bin.extra.7 +} + +define i32 @sad8_64bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 { +; SSE2-LABEL: sad8_64bit_icmp_sle: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: addl %edx, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %edx +; 
SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: addl %edx, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: addl %edx, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: addl %ecx, %eax +; SSE2-NEXT: retq +; +; AVX2-LABEL: sad8_64bit_icmp_sle: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sad8_64bit_icmp_sle: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: movl %edx, %eax +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, 
%edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: retq +entry: + %idx.ext = zext i32 %stride to i64 + br label %for.body + +for.body: ; preds = %entry + %0 = bitcast i8* %cur to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = zext <8 x i8> %1 to <8 x i32> + %3 = bitcast i8* %ref to <8 x i8>* + %4 = load <8 x i8>, <8 x i8>* %3, align 1 + %5 = zext <8 x i8> %4 to <8 x i32> + %6 = sub nsw <8 x i32> %2, %5 + %7 = icmp slt <8 x i32> %6, + %8 = sub nsw <8 x i32> zeroinitializer, %6 + %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6 + %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %9, %rdx.shuf + %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229 + %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> + %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231 + %10 = extractelement <8 x i32> %bin.rdx232, i32 0 + %add.ptr = getelementptr inbounds i8, i8* %cur, i64 %idx.ext + %add.ptr178 = getelementptr inbounds i8, i8* %ref, i64 %idx.ext + %11 = bitcast i8* %add.ptr to <8 x i8>* + %12 = load <8 x i8>, <8 x i8>* %11, align 1 + %13 = zext <8 x i8> %12 to <8 x i32> + %14 = bitcast i8* %add.ptr178 to <8 x i8>* + %15 = load <8 x i8>, <8 x i8>* %14, align 1 + %16 = zext <8 x i8> %15 to <8 x i32> + %17 = sub nsw <8 x i32> %13, %16 + %18 = icmp slt <8 x i32> %17, + %19 = sub nsw <8 x i32> zeroinitializer, %17 + %20 = select <8 x i1> %18, <8 x i32> %19, <8 x i32> %17 + %rdx.shuf.1 = shufflevector <8 x i32> %20, <8 x i32> undef, <8 x i32> + %bin.rdx.1 = add <8 x i32> %20, %rdx.shuf.1 + %rdx.shuf229.1 = shufflevector <8 x i32> %bin.rdx.1, <8 x i32> undef, <8 x i32> + %bin.rdx230.1 = add <8 x i32> %bin.rdx.1, %rdx.shuf229.1 + %rdx.shuf231.1 = shufflevector <8 x i32> %bin.rdx230.1, <8 x i32> undef, <8 x i32> + %bin.rdx232.1 = add <8 x i32> %bin.rdx230.1, 
%rdx.shuf231.1 + %21 = extractelement <8 x i32> %bin.rdx232.1, i32 0 + %bin.extra.1 = add i32 %21, %10 + %add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext + %add.ptr178.1 = getelementptr inbounds i8, i8* %add.ptr178, i64 %idx.ext + %22 = bitcast i8* %add.ptr.1 to <8 x i8>* + %23 = load <8 x i8>, <8 x i8>* %22, align 1 + %24 = zext <8 x i8> %23 to <8 x i32> + %25 = bitcast i8* %add.ptr178.1 to <8 x i8>* + %26 = load <8 x i8>, <8 x i8>* %25, align 1 + %27 = zext <8 x i8> %26 to <8 x i32> + %28 = sub nsw <8 x i32> %24, %27 + %29 = icmp slt <8 x i32> %28, + %30 = sub nsw <8 x i32> zeroinitializer, %28 + %31 = select <8 x i1> %29, <8 x i32> %30, <8 x i32> %28 + %rdx.shuf.2 = shufflevector <8 x i32> %31, <8 x i32> undef, <8 x i32> + %bin.rdx.2 = add <8 x i32> %31, %rdx.shuf.2 + %rdx.shuf229.2 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> + %bin.rdx230.2 = add <8 x i32> %bin.rdx.2, %rdx.shuf229.2 + %rdx.shuf231.2 = shufflevector <8 x i32> %bin.rdx230.2, <8 x i32> undef, <8 x i32> + %bin.rdx232.2 = add <8 x i32> %bin.rdx230.2, %rdx.shuf231.2 + %32 = extractelement <8 x i32> %bin.rdx232.2, i32 0 + %bin.extra.2 = add i32 %32, %bin.extra.1 + %add.ptr.2 = getelementptr inbounds i8, i8* %add.ptr.1, i64 %idx.ext + %add.ptr178.2 = getelementptr inbounds i8, i8* %add.ptr178.1, i64 %idx.ext + %33 = bitcast i8* %add.ptr.2 to <8 x i8>* + %34 = load <8 x i8>, <8 x i8>* %33, align 1 + %35 = zext <8 x i8> %34 to <8 x i32> + %36 = bitcast i8* %add.ptr178.2 to <8 x i8>* + %37 = load <8 x i8>, <8 x i8>* %36, align 1 + %38 = zext <8 x i8> %37 to <8 x i32> + %39 = sub nsw <8 x i32> %35, %38 + %40 = icmp slt <8 x i32> %39, + %41 = sub nsw <8 x i32> zeroinitializer, %39 + %42 = select <8 x i1> %40, <8 x i32> %41, <8 x i32> %39 + %rdx.shuf.3 = shufflevector <8 x i32> %42, <8 x i32> undef, <8 x i32> + %bin.rdx.3 = add <8 x i32> %42, %rdx.shuf.3 + %rdx.shuf229.3 = shufflevector <8 x i32> %bin.rdx.3, <8 x i32> undef, <8 x i32> + %bin.rdx230.3 = add <8 x i32> %bin.rdx.3, %rdx.shuf229.3 + %rdx.shuf231.3 = shufflevector <8 x i32> %bin.rdx230.3, <8 x i32> undef, <8 x i32> + %bin.rdx232.3 = add <8 x i32> %bin.rdx230.3, %rdx.shuf231.3 + %43 = extractelement <8 x i32> %bin.rdx232.3, i32 0 + %bin.extra.3 = add i32 %43, %bin.extra.2 + %add.ptr.3 = getelementptr inbounds i8, i8* %add.ptr.2, i64 %idx.ext + %add.ptr178.3 = getelementptr inbounds i8, i8* %add.ptr178.2, i64 %idx.ext + %44 = bitcast i8* %add.ptr.3 to <8 x i8>* + %45 = load <8 x i8>, <8 x i8>* %44, align 1 + %46 = zext <8 x i8> %45 to <8 x i32> + %47 = bitcast i8* %add.ptr178.3 to <8 x i8>* + %48 = load <8 x i8>, <8 x i8>* %47, align 1 + %49 = zext <8 x i8> %48 to <8 x i32> + %50 = sub nsw <8 x i32> %46, %49 + %51 = icmp slt <8 x i32> %50, + %52 = sub nsw <8 x i32> zeroinitializer, %50 + %53 = select <8 x i1> %51, <8 x i32> %52, <8 x i32> %50 + %rdx.shuf.4 = shufflevector <8 x i32> %53, <8 x i32> undef, <8 x i32> + %bin.rdx.4 = add <8 x i32> %53, %rdx.shuf.4 + %rdx.shuf229.4 = shufflevector <8 x i32> %bin.rdx.4, <8 x i32> undef, <8 x i32> + %bin.rdx230.4 = add <8 x i32> %bin.rdx.4, %rdx.shuf229.4 + %rdx.shuf231.4 = shufflevector <8 x i32> %bin.rdx230.4, <8 x i32> undef, <8 x i32> + %bin.rdx232.4 = add <8 x i32> %bin.rdx230.4, %rdx.shuf231.4 + %54 = extractelement <8 x i32> %bin.rdx232.4, i32 0 + %bin.extra.4 = add i32 %54, %bin.extra.3 + %add.ptr.4 = getelementptr inbounds i8, i8* %add.ptr.3, i64 %idx.ext + %add.ptr178.4 = getelementptr inbounds i8, i8* %add.ptr178.3, i64 %idx.ext + %55 = bitcast i8* %add.ptr.4 to <8 x i8>* + %56 = 
load <8 x i8>, <8 x i8>* %55, align 1 + %57 = zext <8 x i8> %56 to <8 x i32> + %58 = bitcast i8* %add.ptr178.4 to <8 x i8>* + %59 = load <8 x i8>, <8 x i8>* %58, align 1 + %60 = zext <8 x i8> %59 to <8 x i32> + %61 = sub nsw <8 x i32> %57, %60 + %62 = icmp slt <8 x i32> %61, + %63 = sub nsw <8 x i32> zeroinitializer, %61 + %64 = select <8 x i1> %62, <8 x i32> %63, <8 x i32> %61 + %rdx.shuf.5 = shufflevector <8 x i32> %64, <8 x i32> undef, <8 x i32> + %bin.rdx.5 = add <8 x i32> %64, %rdx.shuf.5 + %rdx.shuf229.5 = shufflevector <8 x i32> %bin.rdx.5, <8 x i32> undef, <8 x i32> + %bin.rdx230.5 = add <8 x i32> %bin.rdx.5, %rdx.shuf229.5 + %rdx.shuf231.5 = shufflevector <8 x i32> %bin.rdx230.5, <8 x i32> undef, <8 x i32> + %bin.rdx232.5 = add <8 x i32> %bin.rdx230.5, %rdx.shuf231.5 + %65 = extractelement <8 x i32> %bin.rdx232.5, i32 0 + %bin.extra.5 = add i32 %65, %bin.extra.4 + %add.ptr.5 = getelementptr inbounds i8, i8* %add.ptr.4, i64 %idx.ext + %add.ptr178.5 = getelementptr inbounds i8, i8* %add.ptr178.4, i64 %idx.ext + %66 = bitcast i8* %add.ptr.5 to <8 x i8>* + %67 = load <8 x i8>, <8 x i8>* %66, align 1 + %68 = zext <8 x i8> %67 to <8 x i32> + %69 = bitcast i8* %add.ptr178.5 to <8 x i8>* + %70 = load <8 x i8>, <8 x i8>* %69, align 1 + %71 = zext <8 x i8> %70 to <8 x i32> + %72 = sub nsw <8 x i32> %68, %71 + %73 = icmp slt <8 x i32> %72, + %74 = sub nsw <8 x i32> zeroinitializer, %72 + %75 = select <8 x i1> %73, <8 x i32> %74, <8 x i32> %72 + %rdx.shuf.6 = shufflevector <8 x i32> %75, <8 x i32> undef, <8 x i32> + %bin.rdx.6 = add <8 x i32> %75, %rdx.shuf.6 + %rdx.shuf229.6 = shufflevector <8 x i32> %bin.rdx.6, <8 x i32> undef, <8 x i32> + %bin.rdx230.6 = add <8 x i32> %bin.rdx.6, %rdx.shuf229.6 + %rdx.shuf231.6 = shufflevector <8 x i32> %bin.rdx230.6, <8 x i32> undef, <8 x i32> + %bin.rdx232.6 = add <8 x i32> %bin.rdx230.6, %rdx.shuf231.6 + %76 = extractelement <8 x i32> %bin.rdx232.6, i32 0 + %bin.extra.6 = add i32 %76, %bin.extra.5 + %add.ptr.6 = getelementptr inbounds i8, i8* %add.ptr.5, i64 %idx.ext + %add.ptr178.6 = getelementptr inbounds i8, i8* %add.ptr178.5, i64 %idx.ext + %77 = bitcast i8* %add.ptr.6 to <8 x i8>* + %78 = load <8 x i8>, <8 x i8>* %77, align 1 + %79 = zext <8 x i8> %78 to <8 x i32> + %80 = bitcast i8* %add.ptr178.6 to <8 x i8>* + %81 = load <8 x i8>, <8 x i8>* %80, align 1 + %82 = zext <8 x i8> %81 to <8 x i32> + %83 = sub nsw <8 x i32> %79, %82 + %84 = icmp slt <8 x i32> %83, + %85 = sub nsw <8 x i32> zeroinitializer, %83 + %86 = select <8 x i1> %84, <8 x i32> %85, <8 x i32> %83 + %rdx.shuf.7 = shufflevector <8 x i32> %86, <8 x i32> undef, <8 x i32> + %bin.rdx.7 = add <8 x i32> %86, %rdx.shuf.7 + %rdx.shuf229.7 = shufflevector <8 x i32> %bin.rdx.7, <8 x i32> undef, <8 x i32> + %bin.rdx230.7 = add <8 x i32> %bin.rdx.7, %rdx.shuf229.7 + %rdx.shuf231.7 = shufflevector <8 x i32> %bin.rdx230.7, <8 x i32> undef, <8 x i32> + %bin.rdx232.7 = add <8 x i32> %bin.rdx230.7, %rdx.shuf231.7 + %87 = extractelement <8 x i32> %bin.rdx232.7, i32 0 + %bin.extra.7 = add i32 %87, %bin.extra.6 + ret i32 %bin.extra.7 +} + +define i32 @sad8_64bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 { +; SSE2-LABEL: sad8_64bit_icmp_slt: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq 
{{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: addl %edx, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: addl %edx, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: addl %edx, %ecx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: addl %ecx, %eax +; SSE2-NEXT: retq +; +; AVX2-LABEL: sad8_64bit_icmp_slt: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpsadbw 
%xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sad8_64bit_icmp_slt: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: movl %edx, %eax +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: addq %rax, %rdi +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: retq +entry: + %idx.ext = zext i32 %stride to i64 + br label %for.body + +for.body: ; preds = %entry + %0 = bitcast i8* %cur to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = zext <8 x i8> %1 to <8 x i32> + %3 = bitcast i8* %ref to <8 x i8>* + %4 = load <8 x i8>, <8 x i8>* %3, align 1 + %5 = zext <8 x i8> %4 to <8 x i32> + %6 = sub nsw <8 x i32> %2, %5 + %7 = icmp slt <8 x i32> %6, zeroinitializer + %8 = sub nsw <8 x i32> zeroinitializer, %6 + %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6 + %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %9, %rdx.shuf + %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229 + %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> + %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231 + %10 = extractelement <8 x i32> %bin.rdx232, i32 0 + %add.ptr = getelementptr inbounds i8, i8* %cur, i64 %idx.ext + %add.ptr178 = getelementptr inbounds i8, i8* %ref, i64 %idx.ext + %11 = bitcast i8* %add.ptr to <8 x i8>* + %12 = load <8 x i8>, <8 x i8>* %11, align 1 + %13 = zext <8 x i8> %12 to <8 x i32> + %14 = bitcast i8* %add.ptr178 to <8 x i8>* + 
+  %15 = load <8 x i8>, <8 x i8>* %14, align 1
+  %16 = zext <8 x i8> %15 to <8 x i32>
+  %17 = sub nsw <8 x i32> %13, %16
+  %18 = icmp slt <8 x i32> %17, zeroinitializer
+  %19 = sub nsw <8 x i32> zeroinitializer, %17
+  %20 = select <8 x i1> %18, <8 x i32> %19, <8 x i32> %17
+  %rdx.shuf.1 = shufflevector <8 x i32> %20, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.1 = add <8 x i32> %20, %rdx.shuf.1
+  %rdx.shuf229.1 = shufflevector <8 x i32> %bin.rdx.1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230.1 = add <8 x i32> %bin.rdx.1, %rdx.shuf229.1
+  %rdx.shuf231.1 = shufflevector <8 x i32> %bin.rdx230.1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232.1 = add <8 x i32> %bin.rdx230.1, %rdx.shuf231.1
+  %21 = extractelement <8 x i32> %bin.rdx232.1, i32 0
+  %bin.extra.1 = add i32 %21, %10
+  %add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext
+  %add.ptr178.1 = getelementptr inbounds i8, i8* %add.ptr178, i64 %idx.ext
+  %22 = bitcast i8* %add.ptr.1 to <8 x i8>*
+  %23 = load <8 x i8>, <8 x i8>* %22, align 1
+  %24 = zext <8 x i8> %23 to <8 x i32>
+  %25 = bitcast i8* %add.ptr178.1 to <8 x i8>*
+  %26 = load <8 x i8>, <8 x i8>* %25, align 1
+  %27 = zext <8 x i8> %26 to <8 x i32>
+  %28 = sub nsw <8 x i32> %24, %27
+  %29 = icmp slt <8 x i32> %28, zeroinitializer
+  %30 = sub nsw <8 x i32> zeroinitializer, %28
+  %31 = select <8 x i1> %29, <8 x i32> %30, <8 x i32> %28
+  %rdx.shuf.2 = shufflevector <8 x i32> %31, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.2 = add <8 x i32> %31, %rdx.shuf.2
+  %rdx.shuf229.2 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230.2 = add <8 x i32> %bin.rdx.2, %rdx.shuf229.2
+  %rdx.shuf231.2 = shufflevector <8 x i32> %bin.rdx230.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232.2 = add <8 x i32> %bin.rdx230.2, %rdx.shuf231.2
+  %32 = extractelement <8 x i32> %bin.rdx232.2, i32 0
+  %bin.extra.2 = add i32 %32, %bin.extra.1
+  %add.ptr.2 = getelementptr inbounds i8, i8* %add.ptr.1, i64 %idx.ext
+  %add.ptr178.2 = getelementptr inbounds i8, i8* %add.ptr178.1, i64 %idx.ext
+  %33 = bitcast i8* %add.ptr.2 to <8 x i8>*
+  %34 = load <8 x i8>, <8 x i8>* %33, align 1
+  %35 = zext <8 x i8> %34 to <8 x i32>
+  %36 = bitcast i8* %add.ptr178.2 to <8 x i8>*
+  %37 = load <8 x i8>, <8 x i8>* %36, align 1
+  %38 = zext <8 x i8> %37 to <8 x i32>
+  %39 = sub nsw <8 x i32> %35, %38
+  %40 = icmp slt <8 x i32> %39, zeroinitializer
+  %41 = sub nsw <8 x i32> zeroinitializer, %39
+  %42 = select <8 x i1> %40, <8 x i32> %41, <8 x i32> %39
+  %rdx.shuf.3 = shufflevector <8 x i32> %42, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.3 = add <8 x i32> %42, %rdx.shuf.3
+  %rdx.shuf229.3 = shufflevector <8 x i32> %bin.rdx.3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230.3 = add <8 x i32> %bin.rdx.3, %rdx.shuf229.3
+  %rdx.shuf231.3 = shufflevector <8 x i32> %bin.rdx230.3, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232.3 = add <8 x i32> %bin.rdx230.3, %rdx.shuf231.3
+  %43 = extractelement <8 x i32> %bin.rdx232.3, i32 0
+  %bin.extra.3 = add i32 %43, %bin.extra.2
+  %add.ptr.3 = getelementptr inbounds i8, i8* %add.ptr.2, i64 %idx.ext
+  %add.ptr178.3 = getelementptr inbounds i8, i8* %add.ptr178.2, i64 %idx.ext
+  %44 = bitcast i8* %add.ptr.3 to <8 x i8>*
+  %45 = load <8 x i8>, <8 x i8>* %44, align 1
+  %46 = zext <8 x i8> %45 to <8 x i32>
+  %47 = bitcast i8* %add.ptr178.3 to <8 x i8>*
+  %48 = load <8 x i8>, <8 x i8>* %47, align 1
+  %49 = zext <8 x i8> %48 to <8 x i32>
+  %50 = sub nsw <8 x i32> %46, %49
+  %51 = icmp slt <8 x i32> %50, zeroinitializer
+  %52 = sub nsw <8 x i32> zeroinitializer, %50
+  %53 = select <8 x i1> %51, <8 x i32> %52, <8 x i32> %50
+  %rdx.shuf.4 = shufflevector <8 x i32> %53, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.4 = add <8 x i32> %53, %rdx.shuf.4
+  %rdx.shuf229.4 = shufflevector <8 x i32> %bin.rdx.4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230.4 = add <8 x i32> %bin.rdx.4, %rdx.shuf229.4
+  %rdx.shuf231.4 = shufflevector <8 x i32> %bin.rdx230.4, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232.4 = add <8 x i32> %bin.rdx230.4, %rdx.shuf231.4
+  %54 = extractelement <8 x i32> %bin.rdx232.4, i32 0
+  %bin.extra.4 = add i32 %54, %bin.extra.3
+  %add.ptr.4 = getelementptr inbounds i8, i8* %add.ptr.3, i64 %idx.ext
+  %add.ptr178.4 = getelementptr inbounds i8, i8* %add.ptr178.3, i64 %idx.ext
+  %55 = bitcast i8* %add.ptr.4 to <8 x i8>*
+  %56 = load <8 x i8>, <8 x i8>* %55, align 1
+  %57 = zext <8 x i8> %56 to <8 x i32>
+  %58 = bitcast i8* %add.ptr178.4 to <8 x i8>*
+  %59 = load <8 x i8>, <8 x i8>* %58, align 1
+  %60 = zext <8 x i8> %59 to <8 x i32>
+  %61 = sub nsw <8 x i32> %57, %60
+  %62 = icmp slt <8 x i32> %61, zeroinitializer
+  %63 = sub nsw <8 x i32> zeroinitializer, %61
+  %64 = select <8 x i1> %62, <8 x i32> %63, <8 x i32> %61
+  %rdx.shuf.5 = shufflevector <8 x i32> %64, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.5 = add <8 x i32> %64, %rdx.shuf.5
+  %rdx.shuf229.5 = shufflevector <8 x i32> %bin.rdx.5, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230.5 = add <8 x i32> %bin.rdx.5, %rdx.shuf229.5
+  %rdx.shuf231.5 = shufflevector <8 x i32> %bin.rdx230.5, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232.5 = add <8 x i32> %bin.rdx230.5, %rdx.shuf231.5
+  %65 = extractelement <8 x i32> %bin.rdx232.5, i32 0
+  %bin.extra.5 = add i32 %65, %bin.extra.4
+  %add.ptr.5 = getelementptr inbounds i8, i8* %add.ptr.4, i64 %idx.ext
+  %add.ptr178.5 = getelementptr inbounds i8, i8* %add.ptr178.4, i64 %idx.ext
+  %66 = bitcast i8* %add.ptr.5 to <8 x i8>*
+  %67 = load <8 x i8>, <8 x i8>* %66, align 1
+  %68 = zext <8 x i8> %67 to <8 x i32>
+  %69 = bitcast i8* %add.ptr178.5 to <8 x i8>*
+  %70 = load <8 x i8>, <8 x i8>* %69, align 1
+  %71 = zext <8 x i8> %70 to <8 x i32>
+  %72 = sub nsw <8 x i32> %68, %71
+  %73 = icmp slt <8 x i32> %72, zeroinitializer
+  %74 = sub nsw <8 x i32> zeroinitializer, %72
+  %75 = select <8 x i1> %73, <8 x i32> %74, <8 x i32> %72
+  %rdx.shuf.6 = shufflevector <8 x i32> %75, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.6 = add <8 x i32> %75, %rdx.shuf.6
+  %rdx.shuf229.6 = shufflevector <8 x i32> %bin.rdx.6, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230.6 = add <8 x i32> %bin.rdx.6, %rdx.shuf229.6
+  %rdx.shuf231.6 = shufflevector <8 x i32> %bin.rdx230.6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232.6 = add <8 x i32> %bin.rdx230.6, %rdx.shuf231.6
+  %76 = extractelement <8 x i32> %bin.rdx232.6, i32 0
+  %bin.extra.6 = add i32 %76, %bin.extra.5
+  %add.ptr.6 = getelementptr inbounds i8, i8* %add.ptr.5, i64 %idx.ext
+  %add.ptr178.6 = getelementptr inbounds i8, i8* %add.ptr178.5, i64 %idx.ext
+  %77 = bitcast i8* %add.ptr.6 to <8 x i8>*
+  %78 = load <8 x i8>, <8 x i8>* %77, align 1
+  %79 = zext <8 x i8> %78 to <8 x i32>
+  %80 = bitcast i8* %add.ptr178.6 to <8 x i8>*
+  %81 = load <8 x i8>, <8 x i8>* %80, align 1
+  %82 = zext <8 x i8> %81 to <8 x i32>
+  %83 = sub nsw <8 x i32> %79, %82
+  %84 = icmp slt <8 x i32> %83, zeroinitializer
+  %85 = sub nsw <8 x i32> zeroinitializer, %83
+  %86 = select <8 x i1> %84, <8 x i32> %85, <8 x i32> %83
+  %rdx.shuf.7 = shufflevector <8 x i32> %86, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.7 = add <8 x i32> %86, %rdx.shuf.7
+  %rdx.shuf229.7 = shufflevector <8 x i32> %bin.rdx.7, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230.7 = add <8 x i32> %bin.rdx.7, %rdx.shuf229.7
+  %rdx.shuf231.7 = shufflevector <8 x i32> %bin.rdx230.7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232.7 = add <8 x i32> %bin.rdx230.7, %rdx.shuf231.7
+  %87 = extractelement <8 x i32> %bin.rdx232.7, i32 0
+  %bin.extra.7 = add i32 %87, %bin.extra.6
+  ret i32 %bin.extra.7
+}
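+
+; Each unrolled iteration above loads eight i8 elements from the two
+; pointers, zero-extends them to i32, takes the absolute difference via
+; the sub / icmp slt / negate / select idiom, and sums the eight lanes
+; with a shufflevector+add reduction pyramid before accumulating into
+; the running scalar total. This is the shape the SAD combine is meant
+; to recognize (zext abs-diff feeding a shuffle/add pyramid), so each
+; iteration should collapse to a single psadbw plus a scalar add.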