Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -29036,12 +29036,18 @@
   if (SetCC.getOpcode() != ISD::SETCC)
     return false;
   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
-  if (CC != ISD::SETGT)
+  if (CC != ISD::SETGT && CC != ISD::SETLT)
     return false;
 
   SDValue SelectOp1 = Select->getOperand(1);
   SDValue SelectOp2 = Select->getOperand(2);
 
+  // The following code assumes SelectOp1 is the subtraction operand
+  // and SelectOp2 is the negation operand.
+  // In the case of SETLT the two are the other way around.
+  if (CC == ISD::SETLT)
+    std::swap(SelectOp1, SelectOp2);
+
   // The second operand of the select should be the negation of the first
   // operand, which is implemented as 0 - SelectOp1.
   if (!(SelectOp2.getOpcode() == ISD::SUB &&
@@ -29054,8 +29060,17 @@
   if (SetCC.getOperand(0) != SelectOp1)
     return false;
 
-  // The second operand of the comparison can be either -1 or 0.
-  if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+  // In the SETLT case, the second operand of the comparison can be 1 or 0.
+  APInt SplatVal;
+  if ((CC == ISD::SETLT) &&
+      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
+         SplatVal == 1) ||
+        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+    return false;
+
+  // In the SETGT case, the second operand of the comparison can be -1 or 0.
+  if ((CC == ISD::SETGT) &&
+      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
         ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
     return false;
 
@@ -29184,11 +29199,9 @@
   if (!Subtarget.hasSSE2())
     return SDValue();
 
-  // Verify the type we're extracting from is appropriate
-  // TODO: There's nothing special about i32, any integer type above i16 should
-  // work just as well.
+  // Verify the type we're extracting from is an integer type wider than i16.
   EVT VT = Extract->getOperand(0).getValueType();
-  if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+  if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
     return SDValue();
 
   unsigned RegSize = 128;
@@ -29197,15 +29210,28 @@
   else if (Subtarget.hasAVX2())
     RegSize = 256;
 
-  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
   // TODO: We should be able to handle larger vectors by splitting them before
   // feeding them into several SADs, and then reducing over those.
-  if (VT.getSizeInBits() / 4 > RegSize)
+  if (RegSize / VT.getVectorNumElements() < 8)
     return SDValue();
 
   // Match shuffle + add pyramid.
   SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
 
+  // The operand is expected to be zero extended from i8
+  // (verified in detectZextAbsDiff).
+  // In order to convert to i64 and above, an additional any/zero/sign
+  // extend is expected.
+  // The zero extend from 32 bits has no mathematical effect on the result.
+  // The sign extend is also effectively a zero extend
+  // (it extends the sign bit, which is known to be zero).
+  // So it is correct to skip the sign/zero extend instruction.
+  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
+               Root.getOpcode() == ISD::ZERO_EXTEND ||
+               Root.getOpcode() == ISD::ANY_EXTEND))
+    Root = Root.getOperand(0);
+
   // If there was a match, we want Root to be a select that is the root of an
   // abs-diff pattern.
   if (!Root || (Root.getOpcode() != ISD::VSELECT))
@@ -29216,7 +29242,7 @@
   if (!detectZextAbsDiff(Root, Zext0, Zext1))
     return SDValue();
 
-  // Create the SAD instruction
+  // Create the SAD instruction.
   SDLoc DL(Extract);
   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
 
@@ -29238,10 +29264,12 @@
     }
   }
 
-  // Return the lowest i32.
-  MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+  MVT Type = Extract->getSimpleValueType(0);
+  unsigned TypeSizeInBits = Type.getSizeInBits();
+  // Return the lowest TypeSizeInBits bits.
+  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
   SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
                      Extract->getOperand(1));
 }
 
Index: test/CodeGen/X86/sad_variations.ll
===================================================================
--- test/CodeGen/X86/sad_variations.ll
+++ test/CodeGen/X86/sad_variations.ll
@@ -0,0 +1,347 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+
+define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
+; SSE2-LABEL: sad8_32bit_icmp_sge:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sge:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sge:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 {
+; SSE2-LABEL: sad8_32bit_icmp_sgt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sgt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sgt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp sgt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 {
+; SSE2-LABEL: sad8_32bit_icmp_sle:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sle:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sle:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 {
+; SSE2-LABEL: sad8_32bit_icmp_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_64bit_icmp_sext_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_64bit_icmp_sext_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_64bit_icmp_sext_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    retq
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %10 = sext <8 x i32> %9 to <8 x i64>
+  %rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i64> %rdx.shuf, %10
+  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+  %11 = extractelement <8 x i64> %bin.rdx239, i32 0
+  ret i64 %11
+}
+
+define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_64bit_icmp_zext_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_64bit_icmp_zext_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_64bit_icmp_zext_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    retq
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %10 = zext <8 x i32> %9 to <8 x i64>
+  %rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i64> %rdx.shuf, %10
+  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+  %11 = extractelement <8 x i64> %bin.rdx239, i32 0
+  ret i64 %11
+}
+
+define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_early_64bit_icmp_zext_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_early_64bit_icmp_zext_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    retq
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i64>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i64>
+  %6 = sub nsw <8 x i64> %2, %5
+  %7 = icmp slt <8 x i64> %6, zeroinitializer
+  %8 = sub nsw <8 x i64> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i64> %8, <8 x i64> %6
+  %rdx.shuf = shufflevector <8 x i64> %9, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i64> %rdx.shuf, %9
+  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+  %10 = extractelement <8 x i64> %bin.rdx239, i32 0
+  ret i64 %10
+}
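
Note (editor's aid, not part of the patch): the scalar C++ sketch below is an illustrative reference for the source pattern whose vectorized forms sad_variations.ll exercises; the function name sad8 and the fixed trip count are assumptions, not taken from the test file. Each spelling of the absolute value maps to one of the icmp/select variants that detectZextAbsDiff now accepts, including the new SETLT forms with a swapped select.

  // Scalar reference (illustrative only) for the SAD pattern tested above.
  #include <cstdint>

  int sad8(const uint8_t *cur, const uint8_t *ref) {
    int sum = 0;
    for (int i = 0; i != 8; ++i) {
      int d = int(cur[i]) - int(ref[i]);
      // d >= 0 ? d : -d   -> icmp sgt %d, -1  (SETGT against splat -1)
      // d >  0 ? d : -d   -> icmp sgt %d, 0   (SETGT against zero)
      // d <= 0 ? -d : d   -> icmp slt %d, 1   (SETLT against splat 1)
      // d <  0 ? -d : d   -> icmp slt %d, 0   (SETLT against zero)
      sum += (d >= 0 ? d : -d);
    }
    return sum;
  }

The 64-bit test variants accumulate the same absolute differences into an i64, which is why combineBasicSADPattern now also looks through a sign/zero/any extend above the reduction before matching the select.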
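Note (editor's aid, not part of the patch): the new comment's claim that a sign extend above the reduction behaves like a zero extend can be sanity-checked numerically. The standalone sketch below is a hypothetical check, not code from the patch: an 8-lane u8 SAD is at most 8 * 255 = 2040, so the i32 partial result always has a clear sign bit and sign-extending it to i64 yields the same bits as zero-extending, which is what the combine relies on when it strips the extend node.

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint32_t MaxSad8 = 8 * 255; // worst-case 8-lane SAD of u8 inputs
    for (uint32_t V = 0; V <= MaxSad8; ++V) {
      int64_t Sext = static_cast<int64_t>(static_cast<int32_t>(V)); // sext i32 -> i64
      uint64_t Zext = static_cast<uint64_t>(V);                     // zext i32 -> i64
      assert(static_cast<uint64_t>(Sext) == Zext && "sext and zext must agree");
    }
    return 0;
  }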