diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12492,20 +12492,7 @@
           (it->getType()->isVoidTy() || isa<CallInst, InvokeInst>(it))) {
         KeyNodes.insert(&*it);
         bool OpsChanged = false;
-        auto *SI = dyn_cast<StoreInst>(it);
-        bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
-        if (SI) {
-          auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
-          // Try to vectorize chain in store, if this is the only store to the
-          // address in the block.
-          // TODO: This is just a temporarily solution to save compile time. Need
-          // to investigate if we can safely turn on slp-vectorize-hor-store
-          // instead to allow lookup for reduction chains in all non-vectorized
-          // stores (need to check side effects and compile time).
-          TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) &&
-                               SI->getValueOperand()->hasOneUse();
-        }
-        if (TryToVectorizeRoot) {
+        if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
           for (auto *V : it->operand_values()) {
             // Postponed instructions should not be vectorized here, delay their
             // vectorization.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
@@ -37,13 +37,13 @@
 
 define float @f_used_out_of_tree(<2 x float> %x) {
 ; CHECK-LABEL: @f_used_out_of_tree(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1
+; CHECK-NEXT: [[X0X0:%.*]] = fmul float [[X0]], [[X0]]
+; CHECK-NEXT: [[X1X1:%.*]] = fmul float [[X1]], [[X1]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]]
 ; CHECK-NEXT: store float [[ADD]], float* @a, align 4
-; CHECK-NEXT: ret float [[TMP1]]
+; CHECK-NEXT: ret float [[X0]]
 ;
 ; THRESH1-LABEL: @f_used_out_of_tree(
 ; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
-; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefixes=ALL,CHECK
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefixes=ALL,STORE
 
 ; #include <stdint.h>
 ;
@@ -16,32 +16,32 @@
 ; }
 
 define i32 @add_red(float* %A, i32 %n) {
-; CHECK-LABEL: @add_red(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
-; CHECK-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
-; CHECK-NEXT: [[INC]] = add nsw i64 [[I_033]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
-; CHECK: for.cond.for.end_crit_edge:
-; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; ALL-LABEL: @add_red(
+; ALL-NEXT: entry:
+; ALL-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ALL-NEXT: br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; ALL: for.body.lr.ph:
+; ALL-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
+; ALL-NEXT: br label [[FOR_BODY:%.*]]
+; ALL: for.body:
+; ALL-NEXT: [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; ALL-NEXT: [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
+; ALL-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
+; ALL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; ALL-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
+; ALL-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; ALL-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
+; ALL-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
+; ALL-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
+; ALL-NEXT: [[INC]] = add nsw i64 [[I_033]], 1
+; ALL-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
+; ALL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; ALL: for.cond.for.end_crit_edge:
+; ALL-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
+; ALL-NEXT: br label [[FOR_END]]
+; ALL: for.end:
+; ALL-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; ALL-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
   %cmp31 = icmp sgt i32 %n, 0
@@ -99,34 +99,34 @@
 ; }
 
 define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
-; CHECK-LABEL: @mul_red(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
-; CHECK-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
-; CHECK-NEXT: [[INC]] = add nsw i64 [[I_040]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
-; CHECK: for.cond.for.end_crit_edge:
-; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; ALL-LABEL: @mul_red(
+; ALL-NEXT: entry:
+; ALL-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ALL-NEXT: br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; ALL: for.body.lr.ph:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; ALL-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; ALL-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
+; ALL-NEXT: br label [[FOR_BODY:%.*]]
+; ALL: for.body:
+; ALL-NEXT: [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; ALL-NEXT: [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
+; ALL-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
+; ALL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; ALL-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
+; ALL-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; ALL-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
+; ALL-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
+; ALL-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
+; ALL-NEXT: [[INC]] = add nsw i64 [[I_040]], 1
+; ALL-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
+; ALL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; ALL: for.cond.for.end_crit_edge:
+; ALL-NEXT: [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
+; ALL-NEXT: br label [[FOR_END]]
+; ALL: for.end:
+; ALL-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; ALL-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
   %cmp38 = icmp sgt i32 %n, 0
@@ -196,41 +196,41 @@
 ; }
 
 define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
-; CHECK-LABEL: @long_red(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <8 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, float* [[B]], i64 8
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX45]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[N]] to i64
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX2]] to <8 x float>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <8 x float> [[TMP1]], [[TMP5]]
-; CHECK-NEXT: [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP6]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
-; CHECK-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[OP_RDX]]
-; CHECK-NEXT: [[INC]] = add nsw i64 [[I_083]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP3]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
-; CHECK: for.cond.for.end_crit_edge:
-; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; ALL-LABEL: @long_red(
+; ALL-NEXT: entry:
+; ALL-NEXT: [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ALL-NEXT: br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; ALL: for.body.lr.ph:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <8 x float>*
+; ALL-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; ALL-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, float* [[B]], i64 8
+; ALL-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX45]], align 4
+; ALL-NEXT: [[TMP3:%.*]] = sext i32 [[N]] to i64
+; ALL-NEXT: br label [[FOR_BODY:%.*]]
+; ALL: for.body:
+; ALL-NEXT: [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; ALL-NEXT: [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
+; ALL-NEXT: [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
+; ALL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; ALL-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX2]] to <8 x float>*
+; ALL-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; ALL-NEXT: [[TMP6:%.*]] = fmul fast <8 x float> [[TMP1]], [[TMP5]]
+; ALL-NEXT: [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
+; ALL-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
+; ALL-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
+; ALL-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
+; ALL-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP6]])
+; ALL-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
+; ALL-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[OP_RDX]]
+; ALL-NEXT: [[INC]] = add nsw i64 [[I_083]], 1
+; ALL-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP3]]
+; ALL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; ALL: for.cond.for.end_crit_edge:
+; ALL-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
+; ALL-NEXT: br label [[FOR_END]]
+; ALL: for.end:
+; ALL-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; ALL-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
   %cmp81 = icmp sgt i32 %n, 0
@@ -330,34 +330,34 @@
 ; }
 
 define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
-; CHECK-LABEL: @chain_red(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
-; CHECK-NEXT: [[OP_RDX]] = fadd fast float [[TMP6]], [[SUM_042]]
-; CHECK-NEXT: [[INC]] = add nsw i64 [[I_043]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
-; CHECK: for.cond.for.end_crit_edge:
-; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[OP_RDX]] to i32
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; ALL-LABEL: @chain_red(
+; ALL-NEXT: entry:
+; ALL-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ALL-NEXT: br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; ALL: for.body.lr.ph:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; ALL-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; ALL-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
+; ALL-NEXT: br label [[FOR_BODY:%.*]]
+; ALL: for.body:
+; ALL-NEXT: [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; ALL-NEXT: [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
+; ALL-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
+; ALL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; ALL-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
+; ALL-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; ALL-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
+; ALL-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
+; ALL-NEXT: [[OP_RDX]] = fadd fast float [[TMP6]], [[SUM_042]]
+; ALL-NEXT: [[INC]] = add nsw i64 [[I_043]], 1
+; ALL-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
+; ALL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; ALL: for.cond.for.end_crit_edge:
+; ALL-NEXT: [[PHITMP:%.*]] = fptosi float [[OP_RDX]] to i32
+; ALL-NEXT: br label [[FOR_END]]
+; ALL: for.end:
+; ALL-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; ALL-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
   %cmp41 = icmp sgt i32 %n, 0
@@ -437,65 +437,65 @@
 ; }
 
 define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
-; CHECK-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
-; CHECK: for.body16.lr.ph:
-; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
-; CHECK-NEXT: br label [[FOR_BODY16:%.*]]
-; CHECK: for.cond.cleanup15:
-; CHECK-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
-; CHECK-NEXT: store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
-; CHECK-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK: for.body16:
-; CHECK-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
-; CHECK-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
-; CHECK-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
-; CHECK-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
-; CHECK-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
-; CHECK-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
-; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
-; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
-; CHECK-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
-; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
-; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
-; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
-; CHECK-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
-; CHECK-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
+; ALL-LABEL: @foo(
+; ALL-NEXT: entry:
+; ALL-NEXT: [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
+; ALL-NEXT: br label [[FOR_BODY:%.*]]
+; ALL: for.cond.cleanup:
+; ALL-NEXT: ret void
+; ALL: for.body:
+; ALL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
+; ALL-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
+; ALL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
+; ALL-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; ALL-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; ALL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
+; ALL-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; ALL-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2
+; ALL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
+; ALL-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
+; ALL-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3
+; ALL-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
+; ALL-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
+; ALL-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
+; ALL: for.body16.lr.ph:
+; ALL-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
+; ALL-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
+; ALL-NEXT: br label [[FOR_BODY16:%.*]]
+; ALL: for.cond.cleanup15:
+; ALL-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
+; ALL-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
+; ALL-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
+; ALL-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
+; ALL-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
+; ALL-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
+; ALL-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
+; ALL-NEXT: store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4
+; ALL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ALL-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
+; ALL-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; ALL: for.body16:
+; ALL-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
+; ALL-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
+; ALL-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
+; ALL-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
+; ALL-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
+; ALL-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
+; ALL-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
+; ALL-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
+; ALL-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
+; ALL-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
+; ALL-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
+; ALL-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
+; ALL-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
+; ALL-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
+; ALL-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
+; ALL-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
+; ALL-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
+; ALL-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
+; ALL-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
+; ALL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
+; ALL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
 ;
 entry:
   %cmp1495 = icmp eq i32 %arg_B, 0
@@ -576,20 +576,22 @@
 ; CHECK-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
 ; CHECK: for.body.lr.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[B:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX4]], align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
-; CHECK-NEXT: [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[MUL3:%.*]] = fmul fast double [[TMP0]], [[TMP3]]
+; CHECK-NEXT: [[ADD16:%.*]] = or i64 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]]
+; CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[MUL7:%.*]] = fmul fast double [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[ADD8:%.*]] = fadd fast double [[MUL3]], [[MUL7]]
 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
 ; CHECK-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8
 ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_018]], 1
@@ -598,6 +600,33 @@
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
+; STORE-LABEL: @store_red_double(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; STORE-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; STORE: for.body.lr.ph:
+; STORE-NEXT: [[TMP0:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; STORE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; STORE-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
+; STORE-NEXT: br label [[FOR_BODY:%.*]]
+; STORE: for.body:
+; STORE-NEXT: [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
+; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
+; STORE-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>*
+; STORE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; STORE-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]]
+; STORE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+; STORE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+; STORE-NEXT: [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]]
+; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
+; STORE-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8
+; STORE-NEXT: [[INC]] = add nsw i64 [[I_018]], 1
+; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
+; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; STORE: for.end:
+; STORE-NEXT: ret void
+;
 entry:
   %cmp17 = icmp sgt i32 %n, 0
   br i1 %cmp17, label %for.body.lr.ph, label %for.end
@@ -647,20 +676,38 @@
 ; CHECK-NEXT: [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
 ; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[B]], align 4
 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
-; CHECK-NEXT: store float [[TMP6]], float* [[C_ADDR_038]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ADD34:%.*]] = or i64 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]]
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[MUL3]], [[MUL7]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ADD1135:%.*]] = or i64 [[MUL]], 2
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]]
+; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[ADD14:%.*]] = fadd fast float [[ADD8]], [[MUL13]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX15]], align 4
+; CHECK-NEXT: [[ADD1736:%.*]] = or i64 [[MUL]], 3
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[ADD20:%.*]] = fadd fast float [[ADD14]], [[MUL19]]
+; CHECK-NEXT: store float [[ADD20]], float* [[C_ADDR_038]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
 ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_039]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
@@ -668,6 +715,32 @@
 ; CHECK: for.end:
 ; CHECK-NEXT: ret i32 0
 ;
+; STORE-LABEL: @store_red(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; STORE-NEXT: br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; STORE: for.body.lr.ph:
+; STORE-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
+; STORE-NEXT: br label [[FOR_BODY:%.*]]
+; STORE: for.body:
+; STORE-NEXT: [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
+; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
+; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]]
+; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
+; STORE-NEXT: store float [[TMP6]], float* [[C_ADDR_038]], align 4
+; STORE-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
+; STORE-NEXT: [[INC]] = add nsw i64 [[I_039]], 1
+; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
+; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; STORE: for.end:
+; STORE-NEXT: ret i32 0
+;
 entry:
   %cmp37 = icmp sgt i32 %n, 0
   br i1 %cmp37, label %for.body.lr.ph, label %for.end
@@ -721,11 +794,23 @@
 
 define void @float_red_example4(float* %res) {
 ; CHECK-LABEL: @float_red_example4(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
-; CHECK-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: store float [[ADD_2]], float* [[RES:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
+; STORE-LABEL: @float_red_example4(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16
+; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
+; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
 entry:
   %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
   %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
@@ -741,11 +826,31 @@
 
 define void @float_red_example8(float* %res) {
 ; CHECK-LABEL: @float_red_example8(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
-; CHECK-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: store float [[ADD_6]], float* [[RES:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
+; STORE-LABEL: @float_red_example8(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16
+; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
+; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
 entry:
   %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
   %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
@@ -769,11 +874,47 @@
 
 define void @float_red_example16(float* %res) {
 ; CHECK-LABEL: @float_red_example16(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]])
-; CHECK-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
+; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float [[TMP8]], [[ADD_6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
+; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float [[TMP9]], [[ADD_7]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
+; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float [[TMP10]], [[ADD_8]]
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
+; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float [[TMP11]], [[ADD_9]]
+; CHECK-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
+; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float [[TMP12]], [[ADD_10]]
+; CHECK-NEXT: [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
+; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float [[TMP13]], [[ADD_11]]
+; CHECK-NEXT: [[TMP14:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
+; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float [[TMP14]], [[ADD_12]]
+; CHECK-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
+; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float [[TMP15]], [[ADD_13]]
+; CHECK-NEXT: store float [[ADD_14]], float* [[RES:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
+; STORE-LABEL: @float_red_example16(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16
+; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]])
+; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
 entry:
   %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
   %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
@@ -813,11 +954,23 @@
 
 define void @i32_red_example4(i32* %res) {
 ; CHECK-LABEL: @i32_red_example4(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: store i32 [[ADD_2]], i32* [[RES:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
+; STORE-LABEL: @i32_red_example4(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
+; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
+; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
 entry:
   %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
   %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
@@ -833,11 +986,31 @@
 
 define void @i32_red_example8(i32* %res) {
 ; CHECK-LABEL: @i32_red_example8(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: store i32 [[ADD_6]], i32* [[RES:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
+; STORE-LABEL: @i32_red_example8(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
+; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
+; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
 entry:
   %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
   %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
@@ -861,11 +1034,47 @@
 
 define void @i32_red_example16(i32* %res) {
 ; CHECK-LABEL: @i32_red_example16(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
+; CHECK-NEXT: [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
+; CHECK-NEXT: [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
+; CHECK-NEXT: [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
+; CHECK-NEXT: [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
+; CHECK-NEXT: [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
+; CHECK-NEXT: [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
+; CHECK-NEXT: [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
+; CHECK-NEXT: [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
+; CHECK-NEXT: store i32 [[ADD_14]], i32* [[RES:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
+; STORE-LABEL: @i32_red_example16(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
+; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
+; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
 entry:
   %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
   %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
@@ -905,11 +1114,79 @@
 
 define void @i32_red_example32(i32* %res) {
 ; CHECK-LABEL: @i32_red_example32(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
+; CHECK-NEXT: [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
+; CHECK-NEXT: [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
+; CHECK-NEXT: [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
+; CHECK-NEXT: [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
+; CHECK-NEXT: [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
+; CHECK-NEXT: [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
+; CHECK-NEXT: [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
+; CHECK-NEXT: [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
+; CHECK-NEXT: [[ADD_15:%.*]] = add nsw i32 [[TMP16]], [[ADD_14]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
+; CHECK-NEXT: [[ADD_16:%.*]] = add nsw i32 [[TMP17]], [[ADD_15]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
+; CHECK-NEXT: [[ADD_17:%.*]] = add nsw i32 [[TMP18]], [[ADD_16]]
+; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
+; CHECK-NEXT: [[ADD_18:%.*]] = add nsw i32 [[TMP19]], [[ADD_17]]
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
+; CHECK-NEXT: [[ADD_19:%.*]] = add nsw i32 [[TMP20]], [[ADD_18]]
+; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
+; CHECK-NEXT: [[ADD_20:%.*]] = add nsw i32 [[TMP21]], [[ADD_19]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
+; CHECK-NEXT: [[ADD_21:%.*]] = add nsw i32 [[TMP22]], [[ADD_20]]
+; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
+; CHECK-NEXT: [[ADD_22:%.*]] = add nsw i32 [[TMP23]], [[ADD_21]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
+; CHECK-NEXT: [[ADD_23:%.*]] = add nsw i32 [[TMP24]], [[ADD_22]]
+; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
+; CHECK-NEXT: [[ADD_24:%.*]] = add nsw i32 [[TMP25]], [[ADD_23]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
+; CHECK-NEXT: [[ADD_25:%.*]] = add nsw i32 [[TMP26]], [[ADD_24]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
+; CHECK-NEXT: [[ADD_26:%.*]] = add nsw i32 [[TMP27]], [[ADD_25]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
+; CHECK-NEXT: [[ADD_27:%.*]] = add nsw i32 [[TMP28]], [[ADD_26]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
+; CHECK-NEXT: [[ADD_28:%.*]] = add nsw i32 [[TMP29]], [[ADD_27]]
+; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
+; CHECK-NEXT: [[ADD_29:%.*]] = add nsw i32 [[TMP30]], [[ADD_28]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
+; CHECK-NEXT: [[ADD_30:%.*]] = add nsw i32 [[TMP31]], [[ADD_29]]
+; CHECK-NEXT: store i32 [[ADD_30]], i32* [[RES:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
+; STORE-LABEL: @i32_red_example32(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
+; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
+; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
 entry:
   %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
   %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
@@ -981,12 +1258,12 @@
 declare i32 @foobar(i32)
 
 define void @i32_red_call(i32 %val) {
-; CHECK-LABEL: @i32_red_call(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
-; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
-; CHECK-NEXT: ret void
+; ALL-LABEL: @i32_red_call(
+; ALL-NEXT: entry:
+; ALL-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
+; ALL-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
+; ALL-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
+; ALL-NEXT: ret void
 ;
 entry:
   %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
@@ -1009,18 +1286,18 @@
 }
 
 define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 {
-; CHECK-LABEL: @i32_red_invoke(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
-; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
-; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
-; CHECK: exception:
-; CHECK-NEXT: [[CLEANUP:%.*]] = landingpad i8
-; CHECK-NEXT: cleanup
-; CHECK-NEXT: br label [[NORMAL]]
-; CHECK: normal:
-; CHECK-NEXT: ret void
+; ALL-LABEL: @i32_red_invoke(
+; ALL-NEXT: entry:
+; ALL-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
+; ALL-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
+; ALL-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
+; ALL-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
+; ALL: exception:
+; ALL-NEXT: [[CLEANUP:%.*]] = landingpad i8
+; ALL-NEXT: cleanup
+; ALL-NEXT: br label [[NORMAL]]
+; ALL: normal:
+; ALL-NEXT: ret void
 ;
 entry:
   %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
@@ -1048,17 +1325,17 @@
 ; Test case from PR47670. Reduction result is used as incoming value in phi.
 
 define i32 @reduction_result_used_in_phi(i32* nocapture readonly %data, i1 zeroext %b) {
-; CHECK-LABEL: @reduction_result_used_in_phi(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
-; CHECK: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
-; CHECK-NEXT: ret i32 [[SUM_1]]
+; ALL-LABEL: @reduction_result_used_in_phi(
+; ALL-NEXT: entry:
+; ALL-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
+; ALL: bb:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA:%.*]] to <4 x i32>*
+; ALL-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; ALL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; ALL-NEXT: br label [[EXIT]]
+; ALL: exit:
+; ALL-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
+; ALL-NEXT: ret i32 [[SUM_1]]
 ;
 entry:
   br i1 %b, label %bb, label %exit
@@ -1082,17 +1359,17 @@
 }

 define i32 @reduction_result_used_in_phi_loop(i32* nocapture readonly %data, i1 zeroext %b) {
-; CHECK-LABEL: @reduction_result_used_in_phi_loop(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
-; CHECK: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
-; CHECK-NEXT: ret i32 [[SUM_1]]
+; ALL-LABEL: @reduction_result_used_in_phi_loop(
+; ALL-NEXT: entry:
+; ALL-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
+; ALL: bb:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA:%.*]] to <4 x i32>*
+; ALL-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; ALL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; ALL-NEXT: br label [[EXIT]]
+; ALL: exit:
+; ALL-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
+; ALL-NEXT: ret i32 [[SUM_1]]
 ;
 entry:
   br i1 %b, label %bb, label %exit
@@ -1118,15 +1395,15 @@

 ; Make sure we do not crash or infinite loop on ill-formed IR.
 define void @unreachable_block() {
-; CHECK-LABEL: @unreachable_block(
-; CHECK-NEXT: bb.0:
-; CHECK-NEXT: br label [[BB_1:%.*]]
-; CHECK: dead:
-; CHECK-NEXT: [[T0:%.*]] = add i16 [[T0]], undef
-; CHECK-NEXT: br label [[BB_1]]
-; CHECK: bb.1:
-; CHECK-NEXT: [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
-; CHECK-NEXT: ret void
+; ALL-LABEL: @unreachable_block(
+; ALL-NEXT: bb.0:
+; ALL-NEXT: br label [[BB_1:%.*]]
+; ALL: dead:
+; ALL-NEXT: [[T0:%.*]] = add i16 [[T0]], undef
+; ALL-NEXT: br label [[BB_1]]
+; ALL: bb.1:
+; ALL-NEXT: [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
+; ALL-NEXT: ret void
 ;
 bb.0:
   br label %bb.1
@@ -1143,11 +1420,11 @@

 ; The FMF on the reduction should match the incoming insts.
 define float @fadd_v4f32_fmf(float* %p) {
-; CHECK-LABEL: @fadd_v4f32_fmf(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
-; CHECK-NEXT: ret float [[TMP3]]
+; ALL-LABEL: @fadd_v4f32_fmf(
+; ALL-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; ALL-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; ALL-NEXT: [[TMP3:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
+; ALL-NEXT: ret float [[TMP3]]
 ;
   %p1 = getelementptr inbounds float, float* %p, i64 1
   %p2 = getelementptr inbounds float, float* %p, i64 2
@@ -1167,11 +1444,11 @@

 ; In this example, "contract nnan arcp" are dropped, but "ninf" transfers with the required flags.
 define float @fadd_v4f32_fmf_intersect(float* %p) {
-; CHECK-LABEL: @fadd_v4f32_fmf_intersect(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
-; CHECK-NEXT: ret float [[TMP3]]
+; ALL-LABEL: @fadd_v4f32_fmf_intersect(
+; ALL-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; ALL-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; ALL-NEXT: [[TMP3:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
+; ALL-NEXT: ret float [[TMP3]]
 ;
   %p1 = getelementptr inbounds float, float* %p, i64 1
   %p2 = getelementptr inbounds float, float* %p, i64 2
@@ -1190,19 +1467,24 @@

 define void @nsw_propagation_v4i32(i32* %res, i32 %start) {
 ; CHECK-LABEL: @nsw_propagation_v4i32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[START:%.*]]
-; CHECK-NEXT: store i32 [[OP_RDX]], i32* [[RES:%.*]], align 16
+; CHECK-NEXT: [[T0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[T1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[T2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[T3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[S:%.*]] = add nsw i32 [[START:%.*]], [[T0]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[T1]], [[S]]
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[T2]], [[ADD]]
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[T3]], [[ADD_1]]
+; CHECK-NEXT: store i32 [[ADD_2]], i32* [[RES:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
-
 ; STORE-LABEL: @nsw_propagation_v4i32(
 ; STORE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
 ; STORE-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; STORE-NEXT: [[OP_RDX:%.*]] = add i32 [[START:%.*]], [[TMP2]]
+; STORE-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[START:%.*]]
 ; STORE-NEXT: store i32 [[OP_RDX]], i32* [[RES:%.*]], align 16
 ; STORE-NEXT: ret void
+;
   %t0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
   %t1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
   %t2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8