Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -29456,11 +29456,16 @@ SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1); // We need to turn the vector of i64 into a vector of i32. - MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); - Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); + // If the reduction vector is at least as wide as the psadbw result, just + // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero + // anyway. + MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + if (VT.getSizeInBits() >= ResVT.getSizeInBits()) + Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); + else + Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); - NumConcat = VT.getSizeInBits() / ResVT.getSizeInBits(); - if (NumConcat > 1) { + if (VT.getSizeInBits() > ResVT.getSizeInBits()) { // Update part of elements of the reduction vector. This is done by first // extracting a sub-vector from it, updating this sub-vector, and inserting // it back. 
Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -971,3 +971,124 @@ %12 = extractelement <64 x i32> %bin.rdx6, i32 0 ret i32 %12 } + +define i32 @sad_2i8() nounwind { +; SSE2-LABEL: sad_2i8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFFFFFFFFFFFFFC00 +; SSE2-NEXT: movl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB3_1: # %vector.body +; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psadbw %xmm3, %xmm2 +; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: addq $4, %rax +; SSE2-NEXT: jne .LBB3_1 +; SSE2-NEXT: # BB#2: # %middle.block +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; AVX2-LABEL: sad_2i8: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq $-1024, %rax # imm = 0xFFFFFFFFFFFFFC00 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB3_1: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: addq $4, %rax +; AVX2-NEXT: jne .LBB3_1 +; AVX2-NEXT: # BB#2: # %middle.block +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sad_2i8: +; 
AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFFFFFFFFFFFFFC00 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: .p2align 4, 0x90 +; AVX512F-NEXT: .LBB3_1: # %vector.body +; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: addq $4, %rax +; AVX512F-NEXT: jne .LBB3_1 +; AVX512F-NEXT: # BB#2: # %middle.block +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: sad_2i8: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFFFFFFFFFFFFFC00 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: .p2align 4, 0x90 +; AVX512BW-NEXT: .LBB3_1: # %vector.body +; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: addq $4, %rax +; AVX512BW-NEXT: jne .LBB3_1 +; AVX512BW-NEXT: # BB#2: # %middle.block +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: retq +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.phi = 
phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] + %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index + %1 = bitcast i8* %0 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %1, align 4 + %2 = zext <2 x i8> %wide.load to <2 x i32> + %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index + %4 = bitcast i8* %3 to <2 x i8>* + %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4 + %5 = zext <2 x i8> %wide.load1 to <2 x i32> + %6 = sub nsw <2 x i32> %2, %5 + %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1> + %8 = sub nsw <2 x i32> zeroinitializer, %6 + %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8 + %10 = add nsw <2 x i32> %9, %vec.phi + %index.next = add i64 %index, 4 + %11 = icmp eq i64 %index.next, 1024 + br i1 %11, label %middle.block, label %vector.body + +middle.block: + %.lcssa = phi <2 x i32> [ %10, %vector.body ] + %rdx.shuf = shufflevector <2 x i32> %.lcssa, <2 x i32> undef, <2 x i32> <i32 1, i32 undef> + %bin.rdx = add <2 x i32> %.lcssa, %rdx.shuf + %12 = extractelement <2 x i32> %bin.rdx, i32 0 + ret i32 %12 +} +