Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -393,7 +393,7 @@
 
     SDValue XformToShuffleWithZero(SDNode *N);
     SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
-                           SDValue N1);
+                           SDValue N1, SDNodeFlags Flags);
 
     SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
 
@@ -942,9 +942,17 @@
 }
 
 SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
-                                    SDValue N1) {
+                                    SDValue N1, SDNodeFlags Flags) {
+  // Don't reassociate reductions.
+  if (Flags.hasVectorReduction())
+    return SDValue();
+
   EVT VT = N0.getValueType();
   if (N0.getOpcode() == Opc) {
+    // Don't reassociate reductions.
+    if (N0->getFlags().hasVectorReduction())
+      return SDValue();
+
     if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
       if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
         // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
@@ -965,6 +973,10 @@
   }
 
   if (N1.getOpcode() == Opc) {
+    // Don't reassociate reductions.
+    if (N1->getFlags().hasVectorReduction())
+      return SDValue();
+
     if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1))) {
       if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
         // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
@@ -2110,7 +2122,7 @@
     return NewSel;
 
   // reassociate add
-  if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1))
+  if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
    return RADD;
 
   // fold ((0-A) + B) -> B-A
@@ -2974,7 +2986,7 @@
                                      N0.getOperand(1), N1));
 
   // reassociate mul
-  if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1))
+  if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
     return RMUL;
 
   return SDValue();
@@ -4429,7 +4441,7 @@
     return NewSel;
 
   // reassociate and
-  if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))
+  if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
     return RAND;
 
   // Try to convert a constant mask AND into a shuffle clear mask.
@@ -5139,7 +5151,7 @@
     return BSwap;
 
   // reassociate or
-  if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1))
+  if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
     return ROR;
 
   // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
@@ -6016,7 +6028,7 @@
     return NewSel;
 
   // reassociate xor
-  if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1))
+  if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1, N->getFlags()))
     return RXOR;
 
   // fold !(x cc y) -> (x !cc y)
Index: test/CodeGen/X86/sad.ll
===================================================================
--- test/CodeGen/X86/sad.ll
+++ test/CodeGen/X86/sad.ll
@@ -1395,3 +1395,113 @@
   %sum = extractelement <32 x i32> %sum3, i32 0
   ret i32 %sum
 }
+
+; This contains an unrolled sad loop with a non-zero initial value.
+; DAGCombiner reassociation previously rewrote the adds to move the constant vector further down the tree. This resulted in the vector-reduction flag being lost.
+define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %arg2, <16 x i8>* %arg3) {
+; SSE2-LABEL: sad_unroll_nonzero_initial:
+; SSE2:       # %bb.0: # %bb
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    movdqu (%rsi), %xmm1
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movl $1, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movdqu (%rdx), %xmm1
+; SSE2-NEXT:    movdqu (%rcx), %xmm2
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: sad_unroll_nonzero_initial:
+; AVX1:       # %bb.0: # %bb
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX1-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqu (%rdx), %xmm1
+; AVX1-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
+; AVX1-NEXT:    movl $1, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sad_unroll_nonzero_initial:
+; AVX2:       # %bb.0: # %bb
+; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX2-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
+; AVX2-NEXT:    movl $1, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu (%rdx), %xmm1
+; AVX2-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
+; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sad_unroll_nonzero_initial:
+; AVX512:       # %bb.0: # %bb
+; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
+; AVX512-NEXT:    movl $1, %eax
+; AVX512-NEXT:    vmovd %eax, %xmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqu (%rdx), %xmm1
+; AVX512-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
+; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+bb:
+  %tmp = load <16 x i8>, <16 x i8>* %arg, align 1
+  %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
+  %tmp5 = zext <16 x i8> %tmp to <16 x i32>
+  %tmp6 = zext <16 x i8> %tmp4 to <16 x i32>
+  %tmp7 = sub nsw <16 x i32> %tmp5, %tmp6
+  %tmp8 = icmp slt <16 x i32> %tmp7, zeroinitializer
+  %tmp9 = sub nsw <16 x i32> zeroinitializer, %tmp7
+  %tmp10 = select <16 x i1> %tmp8, <16 x i32> %tmp9, <16 x i32> %tmp7
+  %tmp11 = add nuw nsw <16 x i32> %tmp10, <i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %tmp12 = load <16 x i8>, <16 x i8>* %arg2, align 1
+  %tmp13 = load <16 x i8>, <16 x i8>* %arg3, align 1
+  %tmp14 = zext <16 x i8> %tmp12 to <16 x i32>
+  %tmp15 = zext <16 x i8> %tmp13 to <16 x i32>
+  %tmp16 = sub nsw <16 x i32> %tmp14, %tmp15
+  %tmp17 = icmp slt <16 x i32> %tmp16, zeroinitializer
+  %tmp18 = sub nsw <16 x i32> zeroinitializer, %tmp16
+  %tmp19 = select <16 x i1> %tmp17, <16 x i32> %tmp18, <16 x i32> %tmp16
+  %tmp20 = add nuw nsw <16 x i32> %tmp19, %tmp11
+  %tmp21 = shufflevector <16 x i32> %tmp20, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp22 = add <16 x i32> %tmp20, %tmp21
+  %tmp23 = shufflevector <16 x i32> %tmp22, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp24 = add <16 x i32> %tmp22, %tmp23
+  %tmp25 = shufflevector <16 x i32> %tmp24, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp26 = add <16 x i32> %tmp24, %tmp25
+  %tmp27 = shufflevector <16 x i32> %tmp26, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp28 = add <16 x i32> %tmp26, %tmp27
+  %tmp29 = extractelement <16 x i32> %tmp28, i64 0
+  ret i32 %tmp29
+}
+