Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4459,11 +4459,12 @@
       // Emit a reduction.
       Value *ReducedSubTree =
-          emitReduction(VectorizedRoot, Builder, ReduxWidth);
+          emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
       if (VectorizedTree) {
         Builder.SetCurrentDebugLocation(Loc);
         VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
                                              ReducedSubTree, "bin.rdx");
+        propagateIRFlags(VectorizedTree, ReductionOps);
       } else
         VectorizedTree = ReducedSubTree;
       i += ReduxWidth;
@@ -4477,6 +4478,7 @@
         Builder.SetCurrentDebugLocation(I->getDebugLoc());
         VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
                                              I);
+        propagateIRFlags(VectorizedTree, ReductionOps);
       }
       for (auto &Pair : ExternallyUsedValues) {
         assert(!Pair.second.empty() &&
@@ -4527,7 +4529,7 @@
 
   /// \brief Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
-                       unsigned ReduxWidth) {
+                       unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(isPowerOf2_32(ReduxWidth) &&
            "We only handle power-of-two reductions for now");
@@ -4554,6 +4556,7 @@
             TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
         TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
       }
+      propagateIRFlags(TmpVec, RedOps);
     }
 
     // The result is in the first element of the vector.
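For context on the hunks above: emitReduction lowers the reduction by repeatedly shuffling the upper half of the vector down and combining it with the lower half, log2(ReduxWidth) times, and with this patch each such combine now receives the IR flags of the scalar reduction ops. A scalar model of that halving scheme, as a sketch only (reduceAdd is an illustrative name, not an LLVM function):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Scalar model of the shuffle-based reduction emitReduction generates:
    // each step folds the upper half of the lanes onto the lower half,
    // halving the active width until lane 0 holds the result.
    int reduceAdd(std::vector<int> Lanes) {
      assert(!Lanes.empty() && (Lanes.size() & (Lanes.size() - 1)) == 0 &&
             "power-of-two widths only, matching the assert in emitReduction");
      for (std::size_t Width = Lanes.size() / 2; Width != 0; Width /= 2)
        for (std::size_t I = 0; I != Width; ++I)
          Lanes[I] += Lanes[I + Width]; // one "bin.rdx" per halving step
      return Lanes[0];                  // extractelement ... i32 0
    }

Note that this associates the adds differently from the scalar source (lane 0 combines with lane 2 before lane 1), which is why each emitted bin.rdx needs wrap flags of its own rather than implicitly inheriting them from any one scalar instruction.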
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -624,9 +624,9 @@
 ; STORE-LABEL: @i32_red_example4(
 ; STORE: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
 ; STORE: store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT: ret void
@@ -647,11 +647,11 @@
 ; STORE-LABEL: @i32_red_example8(
 ; STORE: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; STORE-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
 ; STORE: store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT: ret void
@@ -680,13 +680,13 @@
 ; STORE-LABEL: @i32_red_example16(
 ; STORE: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <16 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = add <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX4:%.*]] = add <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX6:%.*]] = add <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; STORE-NEXT: [[BIN_RDX6:%.*]] = add nsw <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
 ; STORE-NEXT: [[TMP1:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
 ; STORE: store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT: ret void
@@ -731,15 +731,15 @@
 ; STORE-LABEL: @i32_red_example32(
 ; STORE: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX:%.*]] = add <32 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <32 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = add <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX4:%.*]] = add <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX6:%.*]] = add <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; STORE-NEXT: [[BIN_RDX6:%.*]] = add nsw <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
 ; STORE-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX8:%.*]] = add <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
+; STORE-NEXT: [[BIN_RDX8:%.*]] = add nsw <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
 ; STORE-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
 ; STORE: store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT: ret void
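The test updates above all follow one pattern: the scalar adds feeding these reductions carry nsw, so after the patch the emitted bin.rdx adds are expected to carry it too. Source of roughly this shape would produce the @i32_red_example* reductions (an illustrative sketch only; the test defines its IR directly against the @arr_i32 global):

    // Roughly the shape behind @i32_red_example8: a signed sum over a
    // global i32 array. Signed overflow is undefined in C/C++, so the
    // scalar adds are emitted as 'add nsw i32', and with this patch the
    // vectorized reduction steps keep the nsw.
    extern int arr_i32[32];

    void i32_red_example8_src(int *res) {
      int sum = 0;
      for (int i = 0; i < 8; ++i) // SLP sees this fully unrolled
        sum += arr_i32[i];        // 'add nsw i32' at the IR level
      *res = sum;                 // the 'store i32 ..., i32* %res' check
    }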
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll
@@ -8,9 +8,9 @@
 ; CHECK: [[S1:%.+]] = add nsw <4 x i32>
 ; CHECK: store <4 x i32> [[S1]],
 ; CHECK: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[S1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[S1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[S1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
 ; CHECK: [[ADD52:%.*]] = add nsw i32 [[TMP15]],
 ; CHECK: ret i32 [[ADD52]]
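A final note on semantics: propagateIRFlags, the shared vectorizer utility used here, intersects the flags of all the scalar ops it is handed, so a single reduction add without nsw would strip nsw from every emitted bin.rdx. A toy model of that intersection, not the LLVM implementation:

    #include <vector>

    // Toy model of propagateIRFlags' intersection semantics: the emitted
    // vector op keeps a flag only if every scalar reduction op had it.
    struct WrapFlags { bool NSW, NUW; };

    WrapFlags intersectFlags(const std::vector<WrapFlags> &RedOps) {
      WrapFlags Out{true, true};
      for (const WrapFlags &F : RedOps) {
        Out.NSW &= F.NSW; // one plain 'add' clears nsw for the whole reduction
        Out.NUW &= F.NUW;
      }
      return Out;
    }

This is why the checks above can assert nsw unconditionally: every scalar add in those reduction trees carries it.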