diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4267,16 +4267,35 @@
     // Floating-point operations should have some FMF to enable the reduction.
     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
-    for (unsigned Part = 1; Part < UF; ++Part) {
-      Value *RdxPart = State.get(LoopExitInstDef, Part);
-      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
-        ReducedPartRdx = Builder.CreateBinOp(
-            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
-      } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
-        ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
-                                           ReducedPartRdx, RdxPart);
-      else
-        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+    if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+      // For arithmetic reductions, generate a reduction tree by repeatedly
+      // combining adjacent parts.
+      SmallVector<Value *> ToReduce;
+      for (unsigned Part = 0; Part < UF; ++Part)
+        ToReduce.push_back(State.get(LoopExitInstDef, Part));
+      unsigned NumValuesToReduce = ToReduce.size();
+      while (NumValuesToReduce > 1) {
+        for (unsigned I = 0; I < (NumValuesToReduce - 1); I += 2) {
+          Value *A = ToReduce[I];
+          Value *B = ToReduce[I + 1];
+          ToReduce[I / 2] =
+              Builder.CreateBinOp((Instruction::BinaryOps)Op, A, B, "bin.rdx");
+        }
+        if (NumValuesToReduce % 2 != 0)
+          ToReduce[NumValuesToReduce / 2] = ToReduce[NumValuesToReduce - 1];
+
+        NumValuesToReduce = NumValuesToReduce / 2 + NumValuesToReduce % 2;
+      }
+      ReducedPartRdx = ToReduce[0];
+    } else {
+      for (unsigned Part = 1; Part < UF; ++Part) {
+        Value *RdxPart = State.get(LoopExitInstDef, Part);
+        if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
+          ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
+                                             ReducedPartRdx, RdxPart);
+        else
+          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+      }
     }
   }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -14,7 +14,7 @@
 ; CHECK: %[[ADD1:.*]] = add <vscale x 8 x i32> %[[LOAD1]]
 ; CHECK: %[[ADD2:.*]] = add <vscale x 8 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[ADD:.*]] = add <vscale x 8 x i32> %[[ADD2]], %[[ADD1]]
+; CHECK: %[[ADD:.*]] = add <vscale x 8 x i32> %[[ADD1]], %[[ADD2]]
 ; CHECK-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %[[ADD]])
 entry:
   br label %for.body
@@ -44,7 +44,7 @@
 ; CHECK: %[[OR1:.*]] = or <vscale x 8 x i32> %[[LOAD1]]
 ; CHECK: %[[OR2:.*]] = or <vscale x 8 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[OR:.*]] = or <vscale x 8 x i32> %[[OR2]], %[[OR1]]
+; CHECK: %[[OR:.*]] = or <vscale x 8 x i32> %[[OR1]], %[[OR2]]
 ; CHECK-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> %[[OR]])
 entry:
   br label %for.body
@@ -74,7 +74,7 @@
 ; CHECK: %[[AND1:.*]] = and <vscale x 8 x i32> %[[LOAD1]]
 ; CHECK: %[[AND2:.*]] = and <vscale x 8 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[ABD:.*]] = and <vscale x 8 x i32> %[[ADD2]], %[[AND1]]
+; CHECK: %[[ABD:.*]] = and <vscale x 8 x i32> %[[AND1]], %[[AND2]]
 ; CHECK-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> %[[ADD]])
 entry:
   br label %for.body
@@ -104,7 +104,7 @@
 ; CHECK: %[[XOR1:.*]] = xor <vscale x 8 x i32> %[[LOAD1]]
 ; CHECK: %[[XOR2:.*]] = xor <vscale x 8 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[XOR:.*]] = xor <vscale x 8 x i32> %[[XOR2]], %[[XOR1]]
+; CHECK: %[[XOR:.*]] = xor <vscale x 8 x i32> %[[XOR1]], %[[XOR2]]
 ; CHECK-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> %[[XOR]])
 entry:
   br label %for.body
@@ -202,7 +202,7 @@
 ; CHECK: %[[ADD1:.*]] = fadd fast <vscale x 8 x float> %[[LOAD1]]
 ; CHECK: %[[ADD2:.*]] = fadd fast <vscale x 8 x float> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[ADD:.*]] = fadd fast <vscale x 8 x float> %[[ADD2]], %[[ADD1]]
+; CHECK: %[[ADD:.*]] = fadd fast <vscale x 8 x float> %[[ADD1]], %[[ADD2]]
 ; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[ADD]])
 entry:
   br label %for.body
@@ -231,7 +231,7 @@
 ; CHECK: %[[FADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]]
 ; CHECK: %[[FADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]]
+; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD1]], %[[FADD2]]
 ; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]])
 entry:
   br label %for.body
@@ -332,7 +332,7 @@
 ; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
 ; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL1]], %[[MUL2]]
 ; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
 entry:
   br label %for.body
@@ -366,7 +366,7 @@
 ; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
 ; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
 ; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL1]], %[[MUL2]]
 ; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -82,9 +82,9 @@
 ; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <vscale x 8 x float> %[[VEC_LOAD4]], %[[VEC_PHI4]]
 ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
-; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
-; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD1]], %[[VEC_FADD2]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD3]], %[[VEC_FADD4]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <vscale x 8 x float> %[[BIN_RDX1]], %[[BIN_RDX2]]
 ; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[BIN_RDX3]])
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
@@ -435,9 +435,9 @@
 ; CHECK-UNORDERED: [[FMULADD3]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
 ; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
-; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
-; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[FMULADD]], [[FMULADD1]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <vscale x 8 x float> [[FMULADD2]], [[FMULADD3]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <vscale x 8 x float> [[BIN_RDX]], [[BIN_RDX1]]
 ; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
@@ -515,9 +515,9 @@
 ; CHECK-UNORDERED: [[FMULADD3]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
 ; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
-; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
-; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD]], [[FMULADD1]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD2]], [[FMULADD3]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <vscale x 8 x float> [[BIN_RDX]], [[BIN_RDX1]]
 ; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -128,9 +128,9 @@
 ; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_LOAD4]], %[[VEC_PHI4]]
 ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
-; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
-; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_FADD2]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[VEC_FADD4]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[BIN_RDX1]], %[[BIN_RDX2]]
 ; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]])
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
@@ -208,9 +208,9 @@
 ; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_PHI4]], %[[VEC_LOAD4]]
 ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
-; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
-; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_FADD2]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[VEC_FADD4]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[BIN_RDX1]], %[[BIN_RDX2]]
 ; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]])
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
@@ -780,9 +780,9 @@
 ; CHECK-UNORDERED: %[[FADD4]] = fadd float %[[LOAD4]], %[[VEC_PHI4]]
 ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd float %[[FADD2]], %[[FADD1]]
-; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd float %[[FADD3]], %[[BIN_RDX1]]
-; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd float %[[FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd float %[[FADD1]], %[[FADD2]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd float %[[FADD3]], %[[FADD4]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd float %[[BIN_RDX1]], %[[BIN_RDX2]]
 ; CHECK-UNORDERED: scalar.ph
 ; CHECK-UNORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[BIN_RDX3]], %middle.block ]
 ; CHECK-UNORDERED: for.body
@@ -853,9 +853,9 @@
 ; CHECK-UNORDERED: [[FADD4]] = fadd nnan float [[LOAD4]], [[VEC_PHI4]]
 ; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan float [[FADD2]], [[FADD1]]
-; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan float [[FADD3]], [[BIN_RDX1]]
-; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd nnan float [[FADD4]], [[BIN_RDX2]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan float [[FADD1]], [[FADD2]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan float [[FADD3]], [[FADD4]]
+; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd nnan float [[BIN_RDX1]], [[BIN_RDX2]]
 ; CHECK-UNORDERED: scalar.ph:
 ; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX3]], %middle.block ]
 ; CHECK-UNORDERED: for.body:
@@ -1096,9 +1096,9 @@
 ; CHECK-UNORDERED: [[FMULADD3]] = call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]])
 ; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD1]], [[FMULADD]]
-; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd float [[FMULADD2]], [[BIN_RDX]]
-; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd float [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD]], [[FMULADD1]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd float [[FMULADD2]], [[FMULADD3]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX]], [[BIN_RDX1]]
 ; CHECK-UNORDERED: scalar.ph:
 ; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX2]], %middle.block ]
 ; CHECK-UNORDERED: for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -212,9 +212,9 @@
 ; CHECK-NEXT: [[TMP123:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP123]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP120]], [[TMP119]]
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[TMP121]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[TMP122]], [[BIN_RDX4]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP119]], [[TMP120]]
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[TMP121]], [[TMP122]]
+; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[BIN_RDX4]]
 ; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX5]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -52,9 +52,9 @@
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <16 x i32> [[TMP10]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <16 x i32> [[TMP11]], [[BIN_RDX13]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <16 x i32> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <16 x i32> [[BIN_RDX]], [[BIN_RDX13]]
 ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX14]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -94,9 +94,9 @@
 ; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]]
-; CHECK-NEXT: [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX20:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX19]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[BIN_RDX20:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX19]]
 ; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -260,9 +260,9 @@
 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -447,9 +447,9 @@
 ; CHECK-NEXT: [[TMP104:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP104]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP101]], [[TMP100]]
-; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP102]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP103]], [[BIN_RDX7]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP100]], [[TMP101]]
+; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP102]], [[TMP103]]
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX7]]
 ; CHECK-NEXT: [[TMP105:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -773,9 +773,9 @@
 ; CHECK-NEXT: [[TMP184:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP184]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]]
-; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX37]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP180]], [[TMP181]]
+; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[TMP183]]
+; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX37]]
 ; CHECK-NEXT: [[TMP185:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -950,9 +950,9 @@
 ; CHECK-NEXT: [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP82]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[TMP84]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -1123,9 +1123,9 @@
 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3072
 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 3072, 3072
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -1474,9 +1474,9 @@
 ; CHECK-NEXT: [[TMP152:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2048
 ; CHECK-NEXT: br i1 [[TMP152]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP149]], [[TMP148]]
-; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP151]], [[BIN_RDX37]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP148]], [[TMP149]]
+; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[TMP151]]
+; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX37]]
 ; CHECK-NEXT: [[TMP153:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 2048, 2048
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -1641,9 +1641,9 @@
 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -1808,9 +1808,9 @@
 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -1975,9 +1975,9 @@
 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -2151,9 +2151,9 @@
 ; CHECK-NEXT: [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP82]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[TMP84]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -2325,9 +2325,9 @@
 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -2493,9 +2493,9 @@
 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -2671,9 +2671,9 @@
 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -83,7 +83,7 @@
 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP25]], [[TMP24]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP24]], [[TMP25]]
 ; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
@@ -26,7 +26,7 @@
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX]], 0
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[TMP4]], [[TMP5]]
 ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
 ; CHECK-NEXT: ret i8 [[TMP7]]
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -75,7 +75,7 @@
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP8]], [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -147,7 +147,7 @@
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP8]], [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -219,7 +219,7 @@
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP8]], [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -93,9 +93,9 @@
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -539,7 +539,7 @@
 ; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP50]], [[TMP51]]
 ; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
@@ -630,9 +630,9 @@
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -705,9 +705,9 @@
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[BIN_RDX10]]
 ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -2848,7 +2848,7 @@
 ; UNROLL-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
 ; UNROLL-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP48]], [[TMP47]]
+; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP47]], [[TMP48]]
 ; UNROLL-NEXT: [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; UNROLL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; UNROLL: scalar.ph:
@@ -2937,7 +2937,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP50]], [[TMP51]]
 ; UNROLL-NO-IC-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 10240, 10240
 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP41]], i32 3
@@ -3005,7 +3005,7 @@
 ; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
 ; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP15]], [[TMP14]]
+; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP14]], [[TMP15]]
 ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 10240, 10240
 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF: scalar.ph:
@@ -5390,7 +5390,7 @@
 ; UNROLL: middle.block:
 ; UNROLL-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI2]]
 ; UNROLL-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
-; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP48]], [[TMP49]]
+; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]]
 ; UNROLL-NEXT: [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; UNROLL-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]]
 ; UNROLL: scalar.ph:
@@ -5516,7 +5516,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52:![0-9]+]], !llvm.loop [[LOOP53:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP48]], [[TMP49]]
 ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP43]], i32 3
 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP43]], i32 2
@@ -5584,7 +5584,7 @@
 ; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF51:![0-9]+]], !llvm.loop [[LOOP52:![0-9]+]]
 ; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP11]], [[TMP10]]
+; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP10]], [[TMP11]]
 ; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF: scalar.ph:
 ; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
@@ -6096,7 +6096,7 @@
 ; UNROLL: middle.block:
 ; UNROLL-NEXT: [[TMP79:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI7]]
 ; UNROLL-NEXT: [[TMP80:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
-; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP79]], [[TMP80]]
+; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP80]], [[TMP79]]
 ; UNROLL-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; UNROLL-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]]
 ; UNROLL: scalar.ph:
@@ -6285,7 +6285,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP74:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52]], !llvm.loop [[LOOP56:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP73]], [[TMP72]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP72]], [[TMP73]]
 ; UNROLL-NO-IC-NEXT: [[TMP75:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP43]], i32 3
 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP43]], i32 2
@@ -6370,7 +6370,7 @@
 ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF51]], !llvm.loop [[LOOP55:![0-9]+]]
 ; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]]
+; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP12]], [[TMP13]]
 ; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF: scalar.ph:
 ; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -236,7 +236,7 @@
 ; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]]
+; UNROLL-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI]], [[PREDPHI5]]
 ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; UNROLL-NEXT: [[TMP14:%.*]] = xor i1 [[CMP_N]], true
 ; UNROLL-NEXT: call void @llvm.assume(i1 [[TMP14]])
@@ -312,7 +312,7 @@
 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; UNROLL-NOSIMPLIFY: middle.block:
-; UNROLL-NOSIMPLIFY-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]]
+; UNROLL-NOSIMPLIFY-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI]], [[PREDPHI5]]
 ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_INC26_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NOSIMPLIFY: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -762,7 +762,7 @@
 ; UNROLL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP5]], [[TMP4]]
+; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
 ; UNROLL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
 ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -812,7 +812,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP8]], [[TMP9]]
 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -857,7 +857,7 @@
 ; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; INTERLEAVE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; INTERLEAVE: middle.block:
-; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP5]], [[TMP4]]
+; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX]])
 ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1084,7 +1084,7 @@
 ; UNROLL-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP35]], [[TMP34]]
+; UNROLL-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP34]], [[TMP35]]
 ; UNROLL-NEXT: [[TMP37:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[BIN_RDX]])
 ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1165,7 +1165,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP36]], [[TMP35]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP35]], [[TMP36]]
 ; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1237,7 +1237,7 @@
 ; INTERLEAVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; INTERLEAVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; INTERLEAVE: middle.block:
-; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP19]], [[TMP18]]
+; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP18]], [[TMP19]]
 ; INTERLEAVE-NEXT: [[TMP21:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; INTERLEAVE-NEXT: br label [[SCALAR_PH]]
 ; INTERLEAVE: scalar.ph:
@@ -2228,7 +2228,7 @@
 ; UNROLL-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP29]], [[TMP28]]
+; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP28]], [[TMP29]]
 ; UNROLL-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -2334,7 +2334,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP33]], [[TMP32]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP32]], [[TMP33]]
 ; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -2465,7 +2465,7 @@
 ; INTERLEAVE-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; INTERLEAVE-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; INTERLEAVE: middle.block:
-; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]]
+; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP48]], [[TMP49]]
 ; INTERLEAVE-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]]
 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -2946,7 +2946,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP5]], [[TMP4]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP4]], [[TMP5]]
 ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[TMP12:%.*]], label [[SCALAR_PH]]
@@ -3086,7 +3086,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 65536
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP5]], [[TMP4]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP4]], [[TMP5]]
 ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 65536, 65536
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[TMP12:%.*]], label [[SCALAR_PH]]
@@ -3228,7 +3228,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP4]], [[TMP3]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP3]], [[TMP4]]
 ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[TMP11:%.*]], label [[SCALAR_PH]]
@@ -3457,7 +3457,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP9]], [[TMP8]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP8]], [[TMP9]]
 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -5529,7 +5529,7 @@
 ; UNROLL-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20
 ; UNROLL-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP29]], [[TMP28]]
+; UNROLL-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP28]], [[TMP29]]
 ; UNROLL-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL: scalar.ph:
@@ -5612,7 +5612,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP29]], [[TMP28]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP28]], [[TMP29]]
 ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 20, 20
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -5739,7 +5739,7 @@
 ; INTERLEAVE-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
 ; INTERLEAVE-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
 ; INTERLEAVE: middle.block:
-; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP49]], [[TMP48]]
+; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP48]], [[TMP49]]
 ; INTERLEAVE-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[BIN_RDX]])
 ; INTERLEAVE-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; INTERLEAVE: scalar.ph:
@@ -5976,7 +5976,7 @@
 ; UNROLL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 112
 ; UNROLL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP21]], [[TMP20]]
+; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP20]], [[TMP21]]
 ; UNROLL-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
 ; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD9]], i64 1
 ; UNROLL-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -6059,7 +6059,7 @@
 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 112
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP21]], [[TMP20]]
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP20]], [[TMP21]]
 ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 113, 112
 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD9]], i32 1
@@ -6141,7 +6141,7 @@
 ; INTERLEAVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 112
 ; INTERLEAVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
 ; INTERLEAVE: middle.block:
-; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP21]], [[TMP20]]
+; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP20]], [[TMP21]]
 ; INTERLEAVE-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX]])
 ; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD9]], i64 3
 ; INTERLEAVE-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
@@ -39,9 +39,9 @@
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP11]], [[TMP9]]
-; CHECK-NEXT: [[BIN_RDX7:%.*]] = add i32 [[TMP13]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX8:%.*]] = add i32 [[TMP15]], [[BIN_RDX7]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[BIN_RDX7:%.*]] = add i32 [[TMP13]], [[TMP15]]
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = add i32 [[BIN_RDX]], [[BIN_RDX7]]
 ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
@@ -266,9 +266,9 @@
 ; CHECK-NEXT: [[TMP111:%.*]] = icmp eq i64 [[INDEX_NEXT]], 272
 ; CHECK-NEXT: br i1 [[TMP111]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP104]], [[TMP101]]
-; CHECK-NEXT: [[BIN_RDX37:%.*]] = add i32 [[TMP107]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX38:%.*]] = add i32 [[TMP110]], [[BIN_RDX37]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP101]], [[TMP104]]
+; CHECK-NEXT: [[BIN_RDX37:%.*]] = add i32 [[TMP107]], [[TMP110]]
+; CHECK-NEXT: [[BIN_RDX38:%.*]] = add i32 [[BIN_RDX]], [[BIN_RDX37]]
 ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll b/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
@@ -30,8 +30,8 @@
 ; UF3-NEXT: br i1 [[EC]], label %middle.block, label %vector.body
 ;
 ; UF3-LABEL: middle.block:
-; UF3-NEXT: [[RDX0:%.+]] = add <4 x i32> [[SUM1_NEXT]], [[SUM0_NEXT]]
-; UF3-NEXT: [[RDX1:%.+]] = add <4 x i32> [[SUM2_NEXT]], [[RDX0]]
+; UF3-NEXT: [[RDX0:%.+]] = add <4 x i32> [[SUM0_NEXT]], [[SUM1_NEXT]]
+; UF3-NEXT: [[RDX1:%.+]] = add <4 x i32> [[RDX0]], [[SUM2_NEXT]]
 ; UF3-NEXT: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[RDX1]])
 ;
@@ -77,10 +77,10 @@
 ; UF5-NEXT: br i1 [[EC]], label %middle.block, label %vector.body
 ;
 ; UF5-LABEL: middle.block:
-; UF5-NEXT: [[RDX0:%.+]] = add <4 x i32> [[SUM1_NEXT]], [[SUM0_NEXT]]
-; UF5-NEXT: [[RDX1:%.+]] = add <4 x i32> [[SUM2_NEXT]], [[RDX0]]
-; UF5-NEXT: [[RDX2:%.+]] = add <4 x i32> [[SUM3_NEXT]], [[RDX1]]
-; UF5-NEXT: [[RDX3:%.+]] = add <4 x i32> [[SUM4_NEXT]], [[RDX2]]
+; UF5-NEXT: [[RDX0:%.+]] = add <4 x i32> [[SUM0_NEXT]], [[SUM1_NEXT]]
+; UF5-NEXT: [[RDX1:%.+]] = add <4 x i32> [[SUM2_NEXT]], [[SUM3_NEXT]]
+; UF5-NEXT: [[RDX2:%.+]] = add <4 x i32> [[RDX0]], [[RDX1]]
+; UF5-NEXT: [[RDX3:%.+]] = add <4 x i32> [[RDX2]], [[SUM4_NEXT]]
 ; UF5-NEXT: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[RDX3]])
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
--- a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
@@ -28,7 +28,7 @@
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP37:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
 ; CHECK-NEXT: [[TMP38:%.*]] = trunc <vscale x 8 x i16> [[TMP36]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i8> [[TMP38]], [[TMP37]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i8> [[TMP37]], [[TMP38]]
 ; CHECK-NEXT: [[TMP39:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[BIN_RDX]])
 ; CHECK-NEXT: [[TMP40:%.*]] = zext i8 [[TMP39]] to i32
 ;
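For reference, the pairwise schedule the new loop builds can be sketched outside of LLVM. The following minimal standalone C++ sketch mirrors the tree-reduction loop from the LoopVectorize.cpp hunk, under the assumption of a commutative, associative operation: `combine` stands in for `Builder.CreateBinOp`, and plain strings stand in for the per-part IR values (these names are illustrative, not LLVM API).

```cpp
// Standalone sketch of the pairwise (tree) reduction schedule used above.
#include <iostream>
#include <string>
#include <vector>

// Stand-in for Builder.CreateBinOp: records the association explicitly.
static std::string combine(const std::string &A, const std::string &B) {
  return "(" + A + " + " + B + ")";
}

int main() {
  // Five parts model UF=5, which exercises the odd-count carry-over path.
  std::vector<std::string> ToReduce = {"P0", "P1", "P2", "P3", "P4"};
  unsigned NumValuesToReduce = ToReduce.size();
  while (NumValuesToReduce > 1) {
    // Combine adjacent pairs; each round roughly halves the live values.
    for (unsigned I = 0; I < NumValuesToReduce - 1; I += 2)
      ToReduce[I / 2] = combine(ToReduce[I], ToReduce[I + 1]);
    // A leftover odd element is carried into the next round unmodified.
    if (NumValuesToReduce % 2 != 0)
      ToReduce[NumValuesToReduce / 2] = ToReduce[NumValuesToReduce - 1];
    NumValuesToReduce = NumValuesToReduce / 2 + NumValuesToReduce % 2;
  }
  // Prints (((P0 + P1) + (P2 + P3)) + P4).
  std::cout << ToReduce[0] << "\n";
}
```

With five parts this prints (((P0 + P1) + (P2 + P3)) + P4), the same pairing the updated UF5 checks in reduction-odd-interleave-counts.ll expect; the old code instead produced the serial chain (((P0 + P1) + P2) + P3) + P4, so the tree form shortens the dependence chain in the middle block from UF-1 to about log2(UF) operations.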