Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -12434,7 +12434,20 @@ if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) return false; } - return ConstOrder && *ConstOrder; + if (ConstOrder && *ConstOrder) + return true; + if (!(V1->user_empty()) && !(V2->user_empty())) { + const Instruction *FirstUserOfPhi1 = + dyn_cast(*V1->user_begin()); + const Instruction *FirstUserOfPhi2 = + dyn_cast(*V2->user_begin()); + if (FirstUserOfPhi1 && FirstUserOfPhi2 && + (FirstUserOfPhi1->getParent() == FirstUserOfPhi2->getParent())) { + if (FirstUserOfPhi1->comesBefore(FirstUserOfPhi2)) + return true; + } + } + return false; }; auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { if (V1 == V2) Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll @@ -0,0 +1,113 @@ +; RUN: opt -passes=slp-vectorizer -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck %s + +define <4 x half> @phis(i1 %cmp1, <4 x half> %in1, <4 x half> %in2) { +; CHECK-LABEL: @phis( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x half> [[IN1:%.*]], i64 0 +; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x half> [[IN1]], i64 1 +; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x half> [[IN1]], i64 2 +; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x half> poison, half [[A0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x half> [[TMP0]], half [[A1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[A2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[A3]], i32 1 +; CHECK-NEXT: br i1 [[CMP:%.*]], label [[BB1:%.*]], label [[BB0:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x half> [[IN2:%.*]], i64 0 +; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x half> [[IN2]], i64 1 +; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x half> [[IN2]], i64 2 +; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x half> poison, half [[B0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x half> [[TMP4]], half [[B1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x half> poison, half [[B2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x half> [[TMP6]], half [[B3]], i32 1 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x half> [ [[TMP1]], %entry ], [ [[TMP5]], %bb0 ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x half> [ [[TMP3]], %entry ], [ [[TMP7]], %bb0 ] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x half> [[TMP8]], <2 x half> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x half> [[TMP9]], <2 x half> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x i32> +; CHECK-NEXT: ret <4 x half> [[TMP12]] +entry: + %a0 = extractelement <4 x half> %in1, i64 0 + %a1 = extractelement <4 x half> %in1, i64 1 + %a2 = extractelement <4 x half> %in1, i64 2 + %a3 = extractelement <4 x half> %in1, i64 3 + br i1 %cmp1, label %bb1, label %bb0 + +bb0: + %b0 = extractelement <4 x half> %in2, i64 0 + %b1 = extractelement <4 x half> %in2, i64 1 + %b2 = extractelement <4 x half> %in2, i64 2 + %b3 = extractelement <4 x half> %in2, i64 3 + br label %bb1 + +bb1: + %c0 = phi half [ %a0, %entry ], [ %b0, %bb0 ] + %c1 = phi half [ %a1, %entry ], [ %b1, %bb0 ] + %c2 = phi half [ %a2, %entry ], [ %b2, %bb0 ] + %c3 = phi half [ %a3, %entry ], [ %b3, %bb0 ] + + %o0 = insertelement <4 x half> undef, half %c0, i64 0 + %o1 = insertelement <4 x half> %o0, half %c1, i64 1 + %o2 = insertelement <4 x half> %o1, half %c2, i64 2 + %o3 = insertelement <4 x half> %o2, half %c3, i64 3 + ret <4 x half> %o3 +} + +define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2) { +; CHECK-LABEL: @phis_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x half> [[IN1:%.*]], i64 0 +; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x half> [[IN1]], i64 1 +; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x half> [[IN1]], i64 2 +; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x half> poison, half [[A0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x half> [[TMP0]], half [[A1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[A2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[A3]], i32 1 +; CHECK-NEXT: br i1 [[CMP:%.*]], label [[BB1:%.*]], label [[BB0:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x half> [[IN2:%.*]], i64 0 +; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x half> [[IN2]], i64 1 +; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x half> [[IN2]], i64 2 +; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x half> poison, half [[B0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x half> [[TMP4]], half [[B1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x half> poison, half [[B2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x half> [[TMP6]], half [[B3]], i32 1 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x half> [ [[TMP1]], %entry ], [ [[TMP5]], %bb0 ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x half> [ [[TMP3]], %entry ], [ [[TMP7]], %bb0 ] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x half> [[TMP8]], <2 x half> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x half> [[TMP9]], <2 x half> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x i32> +; CHECK-NEXT: ret <4 x half> [[TMP12]] +entry: + %a0 = extractelement <4 x half> %in1, i64 0 + %a1 = extractelement <4 x half> %in1, i64 1 + %a2 = extractelement <4 x half> %in1, i64 2 + %a3 = extractelement <4 x half> %in1, i64 3 + br i1 %cmp1, label %bb1, label %bb0 + +bb0: + %b0 = extractelement <4 x half> %in2, i64 0 + %b1 = extractelement <4 x half> %in2, i64 1 + %b2 = extractelement <4 x half> %in2, i64 2 + %b3 = extractelement <4 x half> %in2, i64 3 + br label %bb1 + +bb1: + %c3 = phi half [ %a3, %entry ], [ %b3, %bb0 ] + %c2 = phi half [ %a2, %entry ], [ %b2, %bb0 ] + %c1 = phi half [ %a1, %entry ], [ %b1, %bb0 ] + %c0 = phi half [ %a0, %entry ], [ %b0, %bb0 ] + + %o0 = insertelement <4 x half> undef, half %c0, i64 0 + %o1 = insertelement <4 x half> %o0, half %c1, i64 1 + %o2 = insertelement <4 x half> %o1, half %c2, i64 2 + %o3 = insertelement <4 x half> %o2, half %c3, i64 3 + ret <4 x half> %o3 +} Index: llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -21,9 +21,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[SHUFFLE]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], ; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> ; CHECK-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], @@ -32,8 +32,8 @@ ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 ; CHECK-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[ADD13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> [[TMP11]], <2 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], ; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> ; CHECK-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], Index: llvm/test/Transforms/SLPVectorizer/X86/phi.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -140,22 +140,20 @@ define float @foo3(float* nocapture readonly %A) #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00 -; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] +; CHECK-NEXT: [[P_056:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD26:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x float> [ [[TMP6]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX14]], align 4 @@ -163,22 +161,25 @@ ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* ; CHECK-NEXT: [[TMP12]] = load <2 x float>, <2 x float>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP12]], <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = fmul <4 x float> [[TMP14]], -; CHECK-NEXT: [[TMP16]] = fadd <4 x float> [[TMP6]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP17]], 121 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = fmul <4 x float> [[TMP14]], +; CHECK-NEXT: [[TMP16]] = fadd <4 x float> [[TMP7]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; CHECK-NEXT: [[MUL25:%.*]] = fmul float [[TMP17]], 1.100000e+01 +; CHECK-NEXT: [[ADD26]] = fadd float [[P_056]], [[MUL25]] +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP18]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP16]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP16]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP16]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP16]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP21]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP16]], i32 1 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP16]], i32 2 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP16]], i32 3 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]] +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[ADD26]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: @@ -243,7 +244,7 @@ ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ , [[ENTRY]] ], [ [[TMP2:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2]] = fmul <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP2]] = fmul <4 x float> [[TMP1]], ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] @@ -251,9 +252,9 @@ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 ; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 ; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 ; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP6]] ; CHECK-NEXT: ret float [[ADD31]] ; Index: llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -23,42 +23,37 @@ define float @foo(float* nocapture readonly %A) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <2 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP9]], -; CHECK-NEXT: [[TMP11]] = fadd <2 x float> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP13]], 9.000000e+00 -; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] +; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00 +; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX7]] to <2 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9]] = fadd <2 x float> [[TMP4]], [[TMP8]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP10]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[TMP11]] +; CHECK-NEXT: [[ADD12:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD12]] ; CHECK-NEXT: ret float [[ADD17]] ; entry: