Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4062,7 +4062,14 @@
   SmallVector<Value *, 32> ReducedVals;
 
   BinaryOperator *ReductionRoot;
-  PHINode *ReductionPHI;
+  // After a successful horizontal reduction vectorization attempt for a PHI
+  // node, the vectorizer tries to update the root binary op by combining the
+  // vectorized tree and the ReductionPHI node. But during vectorization this
+  // ReductionPHI can itself be vectorized and replaced by an undef value,
+  // while the instruction itself is marked for deletion. This dead PHI node
+  // can then be used in the new binary operation, causing a "Use still stuck
+  // around after Def is destroyed" crash upon PHI node deletion.
+  WeakVH ReductionPHI;
 
   /// The opcode of the reduction.
   unsigned ReductionOpcode;
@@ -4081,8 +4088,8 @@
   unsigned MinVecRegSize;
 
   HorizontalReduction(unsigned MinVecRegSize)
-      : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
-        ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
+      : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
+        IsPairwiseReduction(false), ReduxWidth(0),
         MinVecRegSize(MinVecRegSize) {}
 
   /// \brief Try to find a reduction tree.
@@ -4247,7 +4254,7 @@
                                      ReducedVals[i]);
       }
       // Update users.
-      if (ReductionPHI) {
+      if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
         assert(ReductionRoot && "Need a reduction operation");
         ReductionRoot->setOperand(0, VectorizedTree);
         ReductionRoot->setOperand(1, ReductionPHI);
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -1,6 +1,5 @@
-; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
 
 ; #include
 ;
@@ -15,9 +14,9 @@
 ;   return sum;
 ; }
 
-; NOSTORE-LABEL: add_red
-; NOSTORE: fmul <4 x float>
-; NOSTORE: shufflevector <4 x float>
+; CHECK-LABEL: add_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
 
 define i32 @add_red(float* %A, i32 %n) {
 entry:
@@ -148,8 +147,8 @@
 ; }
 
 ; CHECK-LABEL: long_red
-; CHECK: fmul fast <4 x float>
-; CHECK: shufflevector <4 x float>
+; CHECK: fmul fast <8 x float>
+; CHECK: shufflevector <8 x float>
 
 define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
 entry:
@@ -305,6 +304,149 @@
   ret i32 %sum.0.lcssa
 }
 
+; void foo(const float *arg_A, unsigned arg_B, float *array) {
+;   for (uint32_t i = 0; i < 6; ++i) {
+;     const float *ptr = arg_A + i;
+;     float w0 = array[i * 4 + 0];
+;     float w1 = array[i * 4 + 1];
+;     float w2 = array[i * 4 + 2];
+;     float w3 = array[i * 4 + 3];
+;
+;     for (unsigned j = 0; j < arg_B; ++j) {
+;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
+;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
+;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
+;       const float x4 = x3 + (-4.0f * w2) + w3;
+;       w1 = w0;
+;       w0 = x1;
+;       w3 = w2;
+;       w2 = x3;
+;     }
+;
+;     array[i * 4 + 0] = w0;
+;     array[i * 4 + 1] = w1;
+;     array[i * 4 + 2] = w2;
+;     array[i * 4 + 3] = w3;
+;   }
+; }
+
+define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
+; CHECK-LABEL: @foo(
+; CHECK: fmul fast <4 x float>
+; CHECK: shufflevector <4 x float>
+;
+entry:
+  %cmp1495 = icmp eq i32 %arg_B, 0
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup15
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup15, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
+  %0 = shl i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds float, float* %array, i64 %0
+  %1 = load float, float* %arrayidx, align 4
+  %2 = or i64 %0, 1
+  %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
+  %3 = load float, float* %arrayidx4, align 4
+  %4 = or i64 %0, 2
+  %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
+  %5 = load float, float* %arrayidx8, align 4
+  %6 = or i64 %0, 3
+  %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
+  %7 = load float, float* %arrayidx12, align 4
+  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph
+
+for.body16.lr.ph:                                 ; preds = %for.body
+  %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
+  %8 = load float, float* %add.ptr, align 4
+  br label %for.body16
+
+for.cond.cleanup15:                               ; preds = %for.body16, %for.body
+  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
+  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
+  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
+  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
+  store float %w0.0.lcssa, float* %arrayidx, align 4
+  store float %w1.0.lcssa, float* %arrayidx4, align 4
+  store float %w2.0.lcssa, float* %arrayidx8, align 4
+  store float %w3.0.lcssa, float* %arrayidx12, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond109 = icmp eq i64 %indvars.iv.next, 6
+  br i1 %exitcond109, label %for.cond.cleanup, label %for.body
+
+for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
+  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
+  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
+  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
+  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
+  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
+  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
+  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
+  %sub92 = fadd fast float %mul17, %mul18.neg
+  %sub19 = fadd fast float %sub92, %8
+  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
+  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
+  %mul23 = fmul fast float %w1.099, 0x4002666660000000
+  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
+  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
+  %add2293 = fadd fast float %mul27.neg, %mul25
+  %add24 = fadd fast float %add2293, %mul23
+  %sub2694 = fadd fast float %add24, %mul21.neg
+  %sub28 = fadd fast float %sub2694, %mul20
+  %inc = add nuw i32 %j.098, 1
+  %exitcond = icmp eq i32 %inc, %arg_B
+  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
+}
+
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
+
+; void foo(double * restrict A, double * restrict B, double * restrict C,
+;          int n) {
+;   for (intptr_t i=0; i < n; ++i) {
+;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
+;   }
+; }
+
+; STORE-LABEL: store_red_double
+; STORE: fmul fast <2 x double>
+; STORE: extractelement <2 x double>
+; STORE: extractelement <2 x double>
+
+define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
+entry:
+  %cmp17 = icmp sgt i32 %n, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load double, double* %B, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
+  %1 = load double, double* %arrayidx4, align 8
+  %2 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = shl nsw i64 %i.018, 2
+  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
+  %3 = load double, double* %arrayidx2, align 8
+  %mul3 = fmul fast double %0, %3
+  %add16 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
+  %4 = load double, double* %arrayidx6, align 8
+  %mul7 = fmul fast double %1, %4
+  %add8 = fadd fast double %mul3, %mul7
+  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
+  store double %add8, double* %arrayidx9, align 8
+  %inc = add nsw i64 %i.018, 1
+  %exitcond = icmp eq i64 %inc, %2
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
 ;   float sum = 0;
 ;   for (intptr_t i=0; i < n; ++i) {
@@ -316,9 +458,9 @@
 ;   return sum;
 ; }
 
-; CHECK-LABEL: store_red
-; CHECK: fmul fast <4 x float>
-; CHECK: shufflevector <4 x float>
+; STORE-LABEL: store_red
+; STORE: fmul fast <4 x float>
+; STORE: shufflevector <4 x float>
 
 define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
 entry:
@@ -368,50 +510,3 @@
 
   ret i32 0
 }
-
-; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
-
-; void foo(double * restrict A, double * restrict B, double * restrict C,
-;          int n) {
-;   for (intptr_t i=0; i < n; ++i) {
-;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
-;   }
-; }
-
-; STORE-LABEL: store_red_double
-; STORE: fmul fast <2 x double>
-; STORE: extractelement <2 x double>
-; STORE: extractelement <2 x double>
-
-define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
-entry:
-  %cmp17 = icmp sgt i32 %n, 0
-  br i1 %cmp17, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:
-  %0 = load double, double* %B, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
-  %1 = load double, double* %arrayidx4, align 8
-  %2 = sext i32 %n to i64
-  br label %for.body
-
-for.body:
-  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %mul = shl nsw i64 %i.018, 2
-  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
-  %3 = load double, double* %arrayidx2, align 8
-  %mul3 = fmul fast double %0, %3
-  %add16 = or i64 %mul, 1
-  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
-  %4 = load double, double* %arrayidx6, align 8
-  %mul7 = fmul fast double %1, %4
-  %add8 = fadd fast double %mul3, %mul7
-  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
-  store double %add8, double* %arrayidx9, align 8
-  %inc = add nsw i64 %i.018, 1
-  %exitcond = icmp eq i64 %inc, %2
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
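
Why the guard in the SLPVectorizer hunk works: at the time of this commit, a WeakVH both followed replaceAllUsesWith and nulled itself out when its value was deleted. So once vectorization RAUWs the scalar reduction PHI with undef and queues it for erasure, the ReductionPHI handle observes the undef value, and the new !isa<UndefValue>(ReductionPHI) test keeps the dead PHI out of the rebuilt binary operation. The sketch below is a standalone illustration of those two handle behaviors, not part of the patch: it assumes a recent LLVM, where the RAUW-following behavior that WeakVH had back then is provided by WeakTrackingVH, and the module, function, and value names are illustrative only.

// Standalone sketch (not from the patch). Assumes a recent LLVM, where the
// RAUW-following behavior of the era's WeakVH now lives in WeakTrackingVH.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
#include <cassert>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("weakvh-demo", Ctx);
  Function *F = Function::Create(
      FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false),
      Function::ExternalLinkage, "f", &M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> Builder(BB);

  // Stand-in for the scalar reduction PHI that HorizontalReduction tracks.
  PHINode *Phi = Builder.CreatePHI(Builder.getFloatTy(), 2, "red.phi");
  WeakTrackingVH ReductionPHI(Phi);

  // Vectorization RAUWs the PHI with undef before scheduling it for deletion.
  // The handle follows the RAUW, so the patch's check
  //   if (ReductionPHI && !isa<UndefValue>(ReductionPHI))
  // now sees undef and refuses to wire the dead PHI into the new binary op.
  Phi->replaceAllUsesWith(UndefValue::get(Phi->getType()));
  assert(isa<UndefValue>(ReductionPHI) && "handle followed RAUW to undef");

  // A handle still watching the PHI nulls itself out when the PHI is erased,
  // which is exactly what a raw PHINode * member could not do.
  WeakTrackingVH Dead(Phi);
  Phi->eraseFromParent();
  assert(!Dead && "handle nulled on deletion");
  return 0;
}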