diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9158,7 +9158,9 @@
       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
       Plan->removeVPValueFor(R);
       Plan->addVPValue(R, RedRecipe);
-      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
+      // Append the recipe to the end of the VPBasicBlock because we need to
+      // ensure that it comes after all of its inputs, including CondOp.
+      WidenRecipe->getParent()->appendRecipe(RedRecipe);
       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
       WidenRecipe->eraseFromParent();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1,8 +1,10 @@
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=true -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=false -hints-allow-reordering=false -S | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=false -hints-allow-reordering=true -S | FileCheck %s --check-prefix=CHECK-UNORDERED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=true -hints-allow-reordering=false -S | FileCheck %s --check-prefix=CHECK-ORDERED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=true -hints-allow-reordering=true -S | FileCheck %s --check-prefix=CHECK-UNORDERED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -hints-allow-reordering=false -S | FileCheck %s --check-prefix=CHECK-ORDERED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -hints-allow-reordering=false \
+; RUN:   -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefix=CHECK-ORDERED-TF

 define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-LABEL: @fadd_strict
@@ -14,6 +16,17 @@
 ; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
 ; CHECK-ORDERED: ret float %[[PHI]]

+; CHECK-ORDERED-TF-LABEL: @fadd_strict
+; CHECK-ORDERED-TF: vector.body:
+; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK:.*]] = phi
+; CHECK-ORDERED-TF: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED-TF: %[[LOAD:.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(*
+; CHECK-ORDERED-TF: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[VEC_PHI]], %[[SEL]])
+; CHECK-ORDERED-TF: for.end
+; CHECK-ORDERED-TF: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED-TF: ret float %[[PHI]]
+
 ; CHECK-UNORDERED-LABEL: @fadd_strict
 ; CHECK-UNORDERED: vector.body
 ; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ]
@@ -66,6 +79,30 @@
 ; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
 ; CHECK-ORDERED: ret float %[[PHI]]

+; CHECK-ORDERED-TF-LABEL: @fadd_strict_unroll
+; CHECK-ORDERED-TF: vector.body:
+; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi
+; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi
+; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK3:.*]] = phi
+; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK4:.*]] = phi
+; CHECK-ORDERED-TF: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK-ORDERED-TF-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK-ORDERED-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(*
+; CHECK-ORDERED-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(*
+; CHECK-ORDERED-TF: %[[LOAD3:.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(*
+; CHECK-ORDERED-TF: %[[LOAD4:.*]] = call @llvm.masked.load.nxv8f32.p0nxv8f32(*
+; CHECK-ORDERED-TF: %[[SEL1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[LOAD1]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[VEC_PHI1]], %[[SEL1]])
+; CHECK-ORDERED-TF: %[[SEL2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[LOAD2]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX1]], %[[SEL2]])
+; CHECK-ORDERED-TF: %[[SEL3:.*]] = select %[[ACTIVE_LANE_MASK3]], %[[LOAD3]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX2]], %[[SEL3]])
+; CHECK-ORDERED-TF: %[[SEL4:.*]] = select %[[ACTIVE_LANE_MASK4]], %[[LOAD4]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[RDX4]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX3]], %[[SEL4]])
+; CHECK-ORDERED-TF: for.end
+; CHECK-ORDERED-TF: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK-ORDERED-TF: ret float %[[PHI]]
+
 ; CHECK-UNORDERED-LABEL: @fadd_strict_unroll
 ; CHECK-UNORDERED: vector.body
 ; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
@@ -130,13 +167,36 @@
 ; CHECK-ORDERED: %[[VEC_IND:.*]] = phi [ %[[INDUCTION]], %vector.ph ], [ {{.*}}, %vector.body ]
 ; CHECK-ORDERED: %[[GEP1:.*]] = getelementptr inbounds float, float* %b, %[[VEC_IND]]
 ; CHECK-ORDERED: %[[MGATHER1:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[GEP1]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef)
-; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], %[[MGATHER1]])
 ; CHECK-ORDERED: %[[OR:.*]] = or %[[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer)
 ; CHECK-ORDERED: %[[GEP2:.*]] = getelementptr inbounds float, float* %b, %[[OR]]
 ; CHECK-ORDERED: %[[MGATHER2:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[GEP2]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef)
 ; CHECK-ORDERED: %[[RDX2]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI2]], %[[MGATHER2]])
-; CHECK-ORDERED: for.end
-; CHECK-ORDERED: ret void
+; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], %[[MGATHER1]])
+
+; CHECK-ORDERED-TF-LABEL: @fadd_strict_interleave
+; CHECK-ORDERED-TF: entry
+; CHECK-ORDERED-TF: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK-ORDERED-TF: %[[LOAD1:.*]] = load float, float* %a
+; CHECK-ORDERED-TF: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK-ORDERED-TF: vector.ph
+; CHECK-ORDERED-TF: %[[STEPVEC1:.*]] = call @llvm.experimental.stepvector.nxv4i64()
+; CHECK-ORDERED-TF: %[[STEPVEC_ADD1:.*]] = add %[[STEPVEC1]], zeroinitializer
+; CHECK-ORDERED-TF: %[[STEPVEC_MUL:.*]] = mul %[[STEPVEC_ADD1]], shufflevector ( insertelement ( poison, i64 2, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[INDUCTION:.*]] = add zeroinitializer, %[[STEPVEC_MUL]]
+; CHECK-ORDERED-TF: vector.body
+; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK:.*]] = phi
+; CHECK-ORDERED-TF: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK-ORDERED-TF: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK-ORDERED-TF: %[[VEC_IND:.*]] = phi [ %[[INDUCTION]], %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-ORDERED-TF: %[[GEP1:.*]] = getelementptr inbounds float, float* %b, %[[VEC_IND]]
+; CHECK-ORDERED-TF: %[[MGATHER1:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[GEP1]], i32 4, %[[ACTIVE_LANE_MASK]], undef)
+; CHECK-ORDERED-TF: %[[OR:.*]] = or %[[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[GEP2:.*]] = getelementptr inbounds float, float* %b, %[[OR]]
+; CHECK-ORDERED-TF: %[[MGATHER2:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[GEP2]], i32 4, %[[ACTIVE_LANE_MASK]], undef)
+; CHECK-ORDERED-TF: %[[SEL2:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MGATHER2]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[RDX2]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI2]], %[[SEL2]])
+; CHECK-ORDERED-TF: %[[SEL1:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MGATHER1]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[RDX1]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], %[[SEL1]])

 ; CHECK-UNORDERED-LABEL: @fadd_strict_interleave
 ; CHECK-UNORDERED: entry
@@ -220,6 +280,21 @@
 ; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
 ; CHECK-ORDERED: ret float %[[PHI]]

+; CHECK-ORDERED-TF-LABEL: @fadd_of_sum
+; CHECK-ORDERED-TF: vector.body
+; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK:.*]] = phi
+; CHECK-ORDERED-TF: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(*
+; CHECK-ORDERED-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(*
+; CHECK-ORDERED-TF: %[[ADD:.*]] = fadd %[[LOAD1]], %[[LOAD2]]
+; CHECK-ORDERED-TF: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[ADD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], %[[SEL]])
+; CHECK-ORDERED-TF: for.end.loopexit
+; CHECK-ORDERED-TF: %[[EXIT_PHI:.*]] = phi float [ {{.*}}, %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED-TF: for.end
+; CHECK-ORDERED-TF: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
+; CHECK-ORDERED-TF: ret float %[[PHI]]
+
 ; CHECK-UNORDERED-LABEL: @fadd_of_sum
 ; CHECK-UNORDERED: vector.body
 ; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
@@ -291,6 +366,33 @@
 ; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
 ; CHECK-ORDERED: ret float %[[RDX_PHI]]

+; CHECK-ORDERED-TF-LABEL: @fadd_conditional
+; CHECK-ORDERED-TF: vector.body
+; CHECK-ORDERED-TF: %[[ACTIVE_LANE_MASK:.*]] = phi
+; CHECK-ORDERED-TF: %[[VEC_PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED-TF: %[[LOAD:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(*
+; CHECK-ORDERED-TF: %[[FCMP:.*]] = fcmp une %[[LOAD]], zeroinitializer
+; CHECK-ORDERED-TF: %[[SELECT0:.*]] = select %[[ACTIVE_LANE_MASK]], %[[FCMP]], zeroinitializer
+; CHECK-ORDERED-TF: %[[MASKED_LOAD:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* {{.*}}, i32 4, %[[SELECT0]], poison)
+; CHECK-ORDERED-TF: %[[XOR:.*]] = xor %[[FCMP]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[SELECT1:.*]] = select %[[ACTIVE_LANE_MASK]], %[[XOR]], zeroinitializer
+; CHECK-ORDERED-TF: %[[SELECT2:.*]] = select %[[SELECT1]], shufflevector ( insertelement ( poison, float 3.000000e+00, i32 0), poison, zeroinitializer), %[[MASKED_LOAD]]
+; CHECK-ORDERED-TF: %[[OR:.*]] = or %[[SELECT0]], %[[SELECT1]]
+; CHECK-ORDERED-TF: %[[SELECT3:.*]] = select %[[OR]], %[[SELECT2]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI]], %[[SELECT3]])
+; CHECK-ORDERED-TF: scalar.ph
+; CHECK-ORDERED-TF: %[[MERGE_RDX:.*]] = phi float [ 1.000000e+00, %entry ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED-TF: for.body
+; CHECK-ORDERED-TF: %[[RES:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK-ORDERED-TF: if.then
+; CHECK-ORDERED-TF: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED-TF: for.inc
+; CHECK-ORDERED-TF: %[[PHI:.*]] = phi float [ %[[LOAD2]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK-ORDERED-TF: %[[FADD]] = fadd float %[[RES]], %[[PHI]]
+; CHECK-ORDERED-TF: for.end
+; CHECK-ORDERED-TF: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED-TF: ret float %[[RDX_PHI]]
+
 ; CHECK-UNORDERED-LABEL: @fadd_conditional
 ; CHECK-UNORDERED: vector.body
 ; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer), float 1.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ]
@@ -347,6 +449,9 @@
 ; CHECK-ORDERED-LABEL: @fadd_multiple
 ; CHECK-ORDERED-NOT: vector.body

+; CHECK-ORDERED-TF-LABEL: @fadd_multiple
+; CHECK-ORDERED-TF-NOT: vector.body
+
 ; CHECK-UNORDERED-LABEL: @fadd_multiple
 ; CHECK-UNORDERED: vector.body
 ; CHECK-UNORDERED: %[[PHI:.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer), float -0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
@@ -415,6 +520,37 @@
 ; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
 ; CHECK-ORDERED: ret float [[RES]]

+; CHECK-ORDERED-TF-LABEL: @fmuladd_strict
+; CHECK-ORDERED-TF: vector.body:
+; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK:%.*]] = phi
+; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK1:%.*]] = phi
+; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK2:%.*]] = phi
+; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK3:%.*]] = phi
+; CHECK-ORDERED-TF: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED-TF: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD1:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD2:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD3:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD4:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD5:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD6:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD7:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[FMUL:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED-TF: [[FMUL1:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED-TF: [[FMUL2:%.*]] = fmul [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED-TF: [[FMUL3:%.*]] = fmul [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED-TF: [[SEL:%.*]] = select [[ACTIVE_LANE_MASK]], [[FMUL]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[SEL]])
+; CHECK-ORDERED-TF: [[SEL1:%.*]] = select [[ACTIVE_LANE_MASK1]], [[FMUL1]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], [[SEL1]])
+; CHECK-ORDERED-TF: [[SEL2:%.*]] = select [[ACTIVE_LANE_MASK2]], [[FMUL2]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], [[SEL2]])
+; CHECK-ORDERED-TF: [[SEL3:%.*]] = select [[ACTIVE_LANE_MASK3]], [[FMUL3]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: [[RDX3]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], [[SEL3]])
+; CHECK-ORDERED-TF: for.end
+; CHECK-ORDERED-TF: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
+; CHECK-ORDERED-TF: ret float [[RES]]
+
 ; CHECK-UNORDERED-LABEL: @fmuladd_strict
 ; CHECK-UNORDERED: vector.body
 ; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
@@ -495,6 +631,37 @@
 ; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
 ; CHECK-ORDERED: ret float [[RES]]

+; CHECK-ORDERED-TF-LABEL: @fmuladd_strict_fmf
+; CHECK-ORDERED-TF: vector.body:
+; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK:%.*]] = phi
+; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK1:%.*]] = phi
+; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK2:%.*]] = phi
+; CHECK-ORDERED-TF: [[ACTIVE_LANE_MASK3:%.*]] = phi
+; CHECK-ORDERED-TF: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED-TF: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD1:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD2:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD3:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD4:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD5:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD6:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[WIDE_LOAD7:%.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-ORDERED-TF: [[FMUL:%.*]] = fmul nnan [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED-TF: [[FMUL1:%.*]] = fmul nnan [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED-TF: [[FMUL2:%.*]] = fmul nnan [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED-TF: [[FMUL3:%.*]] = fmul nnan [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED-TF: [[SEL:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[FMUL]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[SEL]])
+; CHECK-ORDERED-TF: [[SEL1:%.*]] = select nnan [[ACTIVE_LANE_MASK1]], [[FMUL1]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], [[SEL1]])
+; CHECK-ORDERED-TF: [[SEL2:%.*]] = select nnan [[ACTIVE_LANE_MASK2]], [[FMUL2]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], [[SEL2]])
+; CHECK-ORDERED-TF: [[SEL3:%.*]] = select nnan [[ACTIVE_LANE_MASK3]], [[FMUL3]], shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer)
+; CHECK-ORDERED-TF: [[RDX3]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], [[SEL3]])
+; CHECK-ORDERED-TF: for.end
+; CHECK-ORDERED-TF: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
+; CHECK-ORDERED-TF: ret float [[RES]]
+
 ; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf
 ; CHECK-UNORDERED: vector.body
 ; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i32 0), poison, zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -263,8 +263,8 @@
 ; CHECK-ORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>*
 ; CHECK-ORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32>
 ; CHECK-ORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32>
-; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]])
 ; CHECK-ORDERED: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]])
+; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]])
 ; CHECK-ORDERED: for.end
 ; CHECK-ORDERED: ret void