Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -29469,8 +29469,14 @@ SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res, DAG.getIntPtrConstant(0, DL)); - } else + } else { + // If the reduction vector is narrower than the result of the sadbw, we can + // just take the low part of the sad without losing any elements. + if (VT.getSizeInBits() < ResVT.getSizeInBits()) + Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad, + DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); + } } static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -971,3 +971,128 @@ %12 = extractelement <64 x i32> %bin.rdx6, i32 0 ret i32 %12 } + +define i32 @sad_2i8() nounwind { +; SSE2-LABEL: sad_2i8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFFFFFFFFFFFFFC00 +; SSE2-NEXT: movl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB3_1: # %vector.body +; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psadbw %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: addq $4, %rax +; SSE2-NEXT: jne .LBB3_1 +; SSE2-NEXT: # BB#2: # %middle.block +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; AVX2-LABEL: sad_2i8: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpxor %xmm0, 
%xmm0, %xmm0 +; AVX2-NEXT: movq $-1024, %rax # imm = 0xFFFFFFFFFFFFFC00 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB3_1: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: addq $4, %rax +; AVX2-NEXT: jne .LBB3_1 +; AVX2-NEXT: # BB#2: # %middle.block +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sad_2i8: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFFFFFFFFFFFFFC00 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: .p2align 4, 0x90 +; AVX512F-NEXT: .LBB3_1: # %vector.body +; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: addq $4, %rax +; AVX512F-NEXT: jne .LBB3_1 +; AVX512F-NEXT: # BB#2: # %middle.block +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: sad_2i8: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; 
AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFFFFFFFFFFFFFC00 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: .p2align 4, 0x90 +; AVX512BW-NEXT: .LBB3_1: # %vector.body +; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: addq $4, %rax +; AVX512BW-NEXT: jne .LBB3_1 +; AVX512BW-NEXT: # BB#2: # %middle.block +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: retq +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] + %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index + %1 = bitcast i8* %0 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %1, align 4 + %2 = zext <2 x i8> %wide.load to <2 x i32> + %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index + %4 = bitcast i8* %3 to <2 x i8>* + %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4 + %5 = zext <2 x i8> %wide.load1 to <2 x i32> + %6 = sub nsw <2 x i32> %2, %5 + %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1> + %8 = sub nsw <2 x i32> zeroinitializer, %6 + %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8 + %10 = add nsw <2 x i32> %9, %vec.phi + %index.next = add i64 %index, 4 + %11 = icmp eq i64 %index.next, 1024 + br i1 %11, label %middle.block, label %vector.body + +middle.block: + %.lcssa = phi <2 x i32> [ %10, %vector.body ] + %rdx.shuf = shufflevector <2 x i32> %.lcssa, <2 x i32> undef, <2 x i32> <i32 1, i32 undef> + %bin.rdx = add <2 x i32> %.lcssa, %rdx.shuf + %12 = extractelement <2 x i32> %bin.rdx, i32 0 + ret i32 %12 +} +