Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6456,7 +6456,7 @@ // to a nearby power-of-2. Can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < 4) + if (NumReducedVals < 2) return false; unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); @@ -6484,7 +6484,7 @@ SmallVector IgnoreList; for (auto &V : ReductionOps) IgnoreList.append(V.begin(), V.end()); - while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { + while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 1) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, IgnoreList); Optional> Order = V.bestOrder(); Index: llvm/test/Feature/weak_constant.ll =================================================================== --- llvm/test/Feature/weak_constant.ll +++ llvm/test/Feature/weak_constant.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -O3 -S > %t -; RUN: grep undef %t | count 1 +; RUN: grep undef %t | count 2 ; RUN: grep 5 %t | count 1 ; RUN: grep 7 %t | count 1 ; RUN: grep 9 %t | count 1 Index: llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll @@ -6,7 +6,7 @@ %structA = type { [2 x float] } define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) { -; CHECK-LABEL: @test1( +; CHECK-LABEL: define {{[^@]+}}@test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[XMIN:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 @@ -19,10 +19,10 @@ ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP6]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[BIN_RDX]], i32 0 +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP7]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27: ; CHECK-NEXT: ret void @@ -51,7 +51,7 @@ } define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) { -; CHECK-LABEL: @test2( +; CHECK-LABEL: define {{[^@]+}}@test2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[XMIN:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 @@ -64,10 +64,10 @@ ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP6]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[BIN_RDX]], i32 0 +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP7]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27: ; CHECK-NEXT: ret void Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -9,7 +9,7 @@ @a = common global [80 x i8] zeroinitializer, align 16 define void @PR28330(i32 %n) { -; DEFAULT-LABEL: @PR28330( +; DEFAULT-LABEL: define {{[^@]+}}@PR28330( ; DEFAULT-NEXT: entry: ; DEFAULT-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 ; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer @@ -21,7 +21,7 @@ ; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]] ; DEFAULT-NEXT: br label [[FOR_BODY]] ; -; GATHER-LABEL: @PR28330( +; GATHER-LABEL: define {{[^@]+}}@PR28330( ; GATHER-NEXT: entry: ; GATHER-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 ; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer @@ -65,7 +65,7 @@ ; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], [[P17]] ; GATHER-NEXT: br label [[FOR_BODY]] ; -; MAX-COST-LABEL: @PR28330( +; MAX-COST-LABEL: define {{[^@]+}}@PR28330( ; MAX-COST-NEXT: entry: ; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 ; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0 @@ -145,7 +145,7 @@ } define void @PR32038(i32 %n) { -; DEFAULT-LABEL: @PR32038( +; DEFAULT-LABEL: define {{[^@]+}}@PR32038( ; DEFAULT-NEXT: entry: ; DEFAULT-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 ; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer @@ -157,7 +157,7 @@ ; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5 ; DEFAULT-NEXT: br label [[FOR_BODY]] ; -; GATHER-LABEL: @PR32038( +; GATHER-LABEL: define {{[^@]+}}@PR32038( ; GATHER-NEXT: entry: ; GATHER-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 ; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer @@ -201,7 +201,7 @@ ; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], -5 ; GATHER-NEXT: br label [[FOR_BODY]] ; -; MAX-COST-LABEL: @PR32038( +; MAX-COST-LABEL: define {{[^@]+}}@PR32038( ; MAX-COST-NEXT: entry: ; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 ; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer @@ -209,10 +209,8 @@ ; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 ; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 ; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 -; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 -; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 -; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 -; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5) to <2 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP3:%.*]] = icmp eq <2 x i8> [[TMP2]], zeroinitializer ; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 ; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 ; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 @@ -220,19 +218,20 @@ ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 -; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> -; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> undef, i1 [[TMP4]], i32 0 +; MAX-COST-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMP6]], i32 1 +; MAX-COST-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> [[TMP7]], i1 [[P5]], i32 2 +; MAX-COST-NEXT: [[TMP9:%.*]] = insertelement <4 x i1> [[TMP8]], i1 [[P7]], i32 3 +; MAX-COST-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> , <2 x i32> +; MAX-COST-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; MAX-COST-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> undef, <2 x i32> +; MAX-COST-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP11]], [[RDX_SHUF]] +; MAX-COST-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[BIN_RDX]], i32 0 +; MAX-COST-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP12]], [[TMP13]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[OP_RDX]], -5 ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] ; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll @@ -14,25 +14,33 @@ ; Tests whether the min/max reduction pattern is vectorized if SLP starts at the store. define i32 @smaxv6() { -; GFX9-LABEL: @smaxv6( +; GFX9-LABEL: define {{[^@]+}}@smaxv6( ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]] -; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i32 [[TMP5]], i32 [[SELECT1]] -; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4 +; GFX9-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <2 x i32>*), align 8 +; GFX9-NEXT: [[LOAD5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <2 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i32> [[TMP4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i32> [[TMP4]], <2 x i32> [[RDX_SHUF]] +; GFX9-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT]], i32 0 +; GFX9-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0 +; GFX9-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP2]], i32 1 +; GFX9-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[LOAD5]], i32 0 +; GFX9-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP3]], i32 1 +; GFX9-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[TMP7]], [[TMP9]] +; GFX9-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP10]], <2 x i32> [[TMP7]], <2 x i32> [[TMP9]] +; GFX9-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; GFX9-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 +; GFX9-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP12]] +; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP12]] +; GFX9-NEXT: [[LOAD6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 +; GFX9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], i32 [[OP_EXTRA]], i32 [[LOAD6]] +; GFX9-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[TMP15]], i32 3, i32 4 ; GFX9-NEXT: store i32 [[STORE_SELECT]], i32* @var, align 8 -; GFX9-NEXT: ret i32 [[OP_EXTRA]] +; GFX9-NEXT: ret i32 [[SELECT5]] ; %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -61,25 +69,33 @@ } define i64 @sminv6() { -; GFX9-LABEL: @sminv6( +; GFX9-LABEL: define {{[^@]+}}@sminv6( ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([32 x i64]* @arr64 to <2 x i64>*), align 16 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -; GFX9-NEXT: [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]] -; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]] -; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i64> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP4]], <4 x i64> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i64> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP6:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 [[SELECT1]] -; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4 +; GFX9-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <2 x i64>*), align 16 +; GFX9-NEXT: [[LOAD5:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 4), align 16 +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> undef, <2 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i64> [[TMP4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[TMP4]], <2 x i64> [[RDX_SHUF]] +; GFX9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0 +; GFX9-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0 +; GFX9-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[TMP2]], i32 1 +; GFX9-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[LOAD5]], i32 0 +; GFX9-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP3]], i32 1 +; GFX9-NEXT: [[TMP10:%.*]] = icmp slt <2 x i64> [[TMP7]], [[TMP9]] +; GFX9-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP10]], <2 x i64> [[TMP7]], <2 x i64> [[TMP9]] +; GFX9-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; GFX9-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; GFX9-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP13]], [[TMP12]] +; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 [[TMP12]] +; GFX9-NEXT: [[LOAD6:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 5), align 8 +; GFX9-NEXT: [[CMP5:%.*]] = icmp slt i64 [[OP_EXTRA]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], i64 [[OP_EXTRA]], i64 [[LOAD6]] +; GFX9-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[TMP15]], i64 3, i64 4 ; GFX9-NEXT: store i64 [[STORE_SELECT]], i64* @var64, align 8 -; GFX9-NEXT: ret i64 [[OP_EXTRA]] +; GFX9-NEXT: ret i64 [[SELECT5]] ; %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16 %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8 @@ -108,25 +124,33 @@ } define float @fmaxv6() { -; GFX9-LABEL: @fmaxv6( +; GFX9-LABEL: define {{[^@]+}}@fmaxv6( ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]] -; GFX9-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <4 x float>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP4]], <4 x float> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast ogt float [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], float [[TMP5]], float [[SELECT1]] -; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00 +; GFX9-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <2 x float>*), align 8 +; GFX9-NEXT: [[LOAD5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16 +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> undef, <2 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <2 x float> [[TMP4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x float> [[TMP4]], <2 x float> [[RDX_SHUF]] +; GFX9-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[RDX_MINMAX_SELECT]], i32 0 +; GFX9-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP5]], i32 0 +; GFX9-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP2]], i32 1 +; GFX9-NEXT: [[TMP8:%.*]] = insertelement <2 x float> undef, float [[LOAD5]], i32 0 +; GFX9-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP3]], i32 1 +; GFX9-NEXT: [[TMP10:%.*]] = fcmp fast ogt <2 x float> [[TMP7]], [[TMP9]] +; GFX9-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP10]], <2 x float> [[TMP7]], <2 x float> [[TMP9]] +; GFX9-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; GFX9-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; GFX9-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP13]], [[TMP12]] +; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], float [[TMP13]], float [[TMP12]] +; GFX9-NEXT: [[LOAD6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4 +; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast ogt float [[OP_EXTRA]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], float [[OP_EXTRA]], float [[LOAD6]] +; GFX9-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[TMP15]], float 3.000000e+00, float 4.000000e+00 ; GFX9-NEXT: store float [[STORE_SELECT]], float* @fvar, align 8 -; GFX9-NEXT: ret float [[OP_EXTRA]] +; GFX9-NEXT: ret float [[SELECT5]] ; %load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16 %load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4 @@ -155,25 +179,33 @@ } define double @dminv6() { -; GFX9-LABEL: @dminv6( +; GFX9-LABEL: define {{[^@]+}}@dminv6( ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]] -; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]] -; GFX9-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <4 x double>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x double> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x double> [[TMP4]], <4 x double> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x double> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast olt double [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], double [[TMP5]], double [[SELECT1]] -; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00 +; GFX9-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <2 x double>*), align 8 +; GFX9-NEXT: [[LOAD5:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16 +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> undef, <2 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <2 x double> [[TMP4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[TMP4]], <2 x double> [[RDX_SHUF]] +; GFX9-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0 +; GFX9-NEXT: [[TMP6:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0 +; GFX9-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP2]], i32 1 +; GFX9-NEXT: [[TMP8:%.*]] = insertelement <2 x double> undef, double [[LOAD5]], i32 0 +; GFX9-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TMP3]], i32 1 +; GFX9-NEXT: [[TMP10:%.*]] = fcmp fast olt <2 x double> [[TMP7]], [[TMP9]] +; GFX9-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP10]], <2 x double> [[TMP7]], <2 x double> [[TMP9]] +; GFX9-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 1 +; GFX9-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 +; GFX9-NEXT: [[TMP14:%.*]] = fcmp fast olt double [[TMP13]], [[TMP12]] +; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], double [[TMP13]], double [[TMP12]] +; GFX9-NEXT: [[LOAD6:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4 +; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast olt double [[OP_EXTRA]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], double [[OP_EXTRA]], double [[LOAD6]] +; GFX9-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[TMP15]], double 3.000000e+00, double 4.000000e+00 ; GFX9-NEXT: store double [[STORE_SELECT]], double* @dvar, align 8 -; GFX9-NEXT: ret double [[OP_EXTRA]] +; GFX9-NEXT: ret double [[SELECT5]] ; %load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16 %load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4 @@ -202,25 +234,32 @@ } define i32 @smax_wdiff_valuenum(i32, i32 %v1) { -; GFX9-LABEL: @smax_wdiff_valuenum( +; GFX9-LABEL: define {{[^@]+}}@smax_wdiff_valuenum( ; GFX9-NEXT: [[VLOAD:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 ; GFX9-NEXT: [[ELT1:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0 -; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[ELT1]], [[V1:%.*]] -; GFX9-NEXT: [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0 -; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]] -; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[SELECT1]] -; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <2 x i32>*), align 8 +; GFX9-NEXT: [[LOAD5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i32> [[TMP2]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i32> [[TMP2]], <2 x i32> [[RDX_SHUF]] +; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT]], i32 0 +; GFX9-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[TMP3]], i32 0 +; GFX9-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[ELT1]], i32 1 +; GFX9-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[LOAD5]], i32 0 +; GFX9-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[V1:%.*]], i32 1 +; GFX9-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP5]], [[TMP7]] +; GFX9-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i32> [[TMP5]], <2 x i32> [[TMP7]] +; GFX9-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP9]], i32 1 +; GFX9-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP9]], i32 0 +; GFX9-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP10]] +; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP10]] +; GFX9-NEXT: [[LOAD6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 +; GFX9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], i32 [[OP_EXTRA]], i32 [[LOAD6]] +; GFX9-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[TMP13]], i32 3, i32 4 ; GFX9-NEXT: store i32 [[STOREVAL]], i32* @var, align 8 -; GFX9-NEXT: ret i32 [[OP_EXTRA]] +; GFX9-NEXT: ret i32 [[SELECT5]] ; %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 %elt1 = extractelement <2 x i32> %vload, i32 0 Index: llvm/test/Transforms/SLPVectorizer/X86/PR34635.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/PR34635.ll +++ llvm/test/Transforms/SLPVectorizer/X86/PR34635.ll @@ -18,15 +18,17 @@ ; CHECK-NEXT: [[T11:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[T1]], i64 0, i64 7 ; CHECK-NEXT: store <8 x i32> , <8 x i32>* [[T]], align 32 ; CHECK-NEXT: [[T12:%.*]] = bitcast i32* [[T2]] to i8* -; CHECK-NEXT: [[T13:%.*]] = load i32, i32* [[T4]], align 32 -; CHECK-NEXT: [[T14:%.*]] = load i32, i32* [[T5]], align 4 -; CHECK-NEXT: [[T15:%.*]] = icmp slt i32 [[T14]], [[T13]] -; CHECK-NEXT: [[T16:%.*]] = select i1 [[T15]], i32 [[T14]], i32 [[T13]] -; CHECK-NEXT: [[T17:%.*]] = zext i1 [[T15]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[T4]] to <2 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 32 +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i32> [[TMP1]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i32> [[TMP1]], <2 x i32> [[RDX_SHUF]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT]], i32 0 +; CHECK-NEXT: [[T17:%.*]] = zext i1 undef to i32 ; CHECK-NEXT: [[T18:%.*]] = load i32, i32* [[T6]], align 8 -; CHECK-NEXT: [[T19:%.*]] = icmp slt i32 [[T18]], [[T16]] -; CHECK-NEXT: [[T20:%.*]] = select i1 [[T19]], i32 [[T18]], i32 [[T16]] -; CHECK-NEXT: [[T21:%.*]] = select i1 [[T19]], i32 2, i32 [[T16]] +; CHECK-NEXT: [[T19:%.*]] = icmp slt i32 [[T18]], [[TMP2]] +; CHECK-NEXT: [[T20:%.*]] = select i1 [[T19]], i32 [[T18]], i32 [[TMP2]] +; CHECK-NEXT: [[T21:%.*]] = select i1 [[T19]], i32 2, i32 [[TMP2]] ; CHECK-NEXT: [[T22:%.*]] = load i32, i32* [[T7]], align 4 ; CHECK-NEXT: [[T23:%.*]] = icmp slt i32 [[T22]], [[T20]] ; CHECK-NEXT: [[T24:%.*]] = select i1 [[T23]], i32 [[T22]], i32 [[T20]] Index: llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll @@ -9,16 +9,16 @@ target triple = "i686-unknown-linux-gnu" define void @vectorize_fp128(fp128 %c, fp128 %d) #0 { -; CHECK-LABEL: @vectorize_fp128( +; CHECK-LABEL: define {{[^@]+}}@vectorize_fp128( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x fp128> undef, fp128 [[C:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x fp128> [[TMP0]], fp128 [[D:%.*]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = call <2 x fp128> @llvm.fabs.v2f128(<2 x fp128> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <2 x fp128> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: [[OR_COND39:%.*]] = or i1 [[TMP4]], [[TMP5]] -; CHECK-NEXT: br i1 [[OR_COND39]], label [[IF_THEN13:%.*]], label [[IF_END24:%.*]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP3]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[IF_THEN13:%.*]], label [[IF_END24:%.*]] ; CHECK: if.then13: ; CHECK-NEXT: unreachable ; CHECK: if.end24: Index: llvm/test/Transforms/SLPVectorizer/X86/hadd.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -11,13 +11,13 @@ ; define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) { -; SSE-LABEL: @test_v2f64( +; SSE-LABEL: define {{[^@]+}}@test_v2f64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] ; SSE-NEXT: ret <2 x double> [[TMP3]] ; -; SLM-LABEL: @test_v2f64( +; SLM-LABEL: define {{[^@]+}}@test_v2f64( ; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 ; SLM-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1 ; SLM-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0 @@ -28,13 +28,13 @@ ; SLM-NEXT: [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1 ; SLM-NEXT: ret <2 x double> [[R01]] ; -; AVX-LABEL: @test_v2f64( +; AVX-LABEL: define {{[^@]+}}@test_v2f64( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> ; AVX-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] ; AVX-NEXT: ret <2 x double> [[TMP3]] ; -; AVX512-LABEL: @test_v2f64( +; AVX512-LABEL: define {{[^@]+}}@test_v2f64( ; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] @@ -52,7 +52,7 @@ } define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: @test_v4f32( +; CHECK-LABEL: define {{[^@]+}}@test_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] @@ -78,34 +78,13 @@ } define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) { -; SSE-LABEL: @test_v2i64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <2 x i64> [[TMP3]] -; -; SLM-LABEL: @test_v2i64( -; SLM-NEXT: [[A0:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <2 x i64> [[A]], i32 1 -; SLM-NEXT: [[B0:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <2 x i64> [[B]], i32 1 -; SLM-NEXT: [[R0:%.*]] = add i64 [[A0]], [[A1]] -; SLM-NEXT: [[R1:%.*]] = add i64 [[B0]], [[B1]] -; SLM-NEXT: [[R00:%.*]] = insertelement <2 x i64> undef, i64 [[R0]], i32 0 -; SLM-NEXT: [[R01:%.*]] = insertelement <2 x i64> [[R00]], i64 [[R1]], i32 1 -; SLM-NEXT: ret <2 x i64> [[R01]] -; -; AVX-LABEL: @test_v2i64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <2 x i64> [[TMP3]] -; -; AVX512-LABEL: @test_v2i64( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; AVX512-NEXT: ret <2 x i64> [[TMP3]] +; CHECK-LABEL: define {{[^@]+}}@test_v2i64( +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <2 x i64> [[RDX_SHUF1]], [[A]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[B:%.*]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[RDX_SHUF]], [[B]] +; CHECK-NEXT: [[R01:%.*]] = shufflevector <2 x i64> [[BIN_RDX2]], <2 x i64> [[BIN_RDX]], <2 x i32> +; CHECK-NEXT: ret <2 x i64> [[R01]] ; %a0 = extractelement <2 x i64> %a, i32 0 %a1 = extractelement <2 x i64> %a, i32 1 @@ -119,7 +98,7 @@ } define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: @test_v4i32( +; CHECK-LABEL: define {{[^@]+}}@test_v4i32( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] @@ -145,7 +124,7 @@ } define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: @test_v8i16( +; CHECK-LABEL: define {{[^@]+}}@test_v8i16( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] @@ -191,7 +170,7 @@ ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; SSE-LABEL: @test_v4f64( +; SSE-LABEL: define {{[^@]+}}@test_v4f64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] @@ -201,7 +180,7 @@ ; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> ; SSE-NEXT: ret <4 x double> [[R03]] ; -; SLM-LABEL: @test_v4f64( +; SLM-LABEL: define {{[^@]+}}@test_v4f64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] @@ -211,13 +190,13 @@ ; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> ; SLM-NEXT: ret <4 x double> [[R03]] ; -; AVX-LABEL: @test_v4f64( +; AVX-LABEL: define {{[^@]+}}@test_v4f64( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> ; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] ; AVX-NEXT: ret <4 x double> [[TMP3]] ; -; AVX512-LABEL: @test_v4f64( +; AVX512-LABEL: define {{[^@]+}}@test_v4f64( ; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] @@ -243,7 +222,7 @@ } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; SSE-LABEL: @test_v8f32( +; SSE-LABEL: define {{[^@]+}}@test_v8f32( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] @@ -253,7 +232,7 @@ ; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> ; SSE-NEXT: ret <8 x float> [[R07]] ; -; SLM-LABEL: @test_v8f32( +; SLM-LABEL: define {{[^@]+}}@test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] @@ -263,13 +242,13 @@ ; SLM-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> ; SLM-NEXT: ret <8 x float> [[R07]] ; -; AVX-LABEL: @test_v8f32( +; AVX-LABEL: define {{[^@]+}}@test_v8f32( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> ; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] ; AVX-NEXT: ret <8 x float> [[TMP3]] ; -; AVX512-LABEL: @test_v8f32( +; AVX512-LABEL: define {{[^@]+}}@test_v8f32( ; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] @@ -311,7 +290,7 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; SSE-LABEL: @test_v4i64( +; SSE-LABEL: define {{[^@]+}}@test_v4i64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> ; SSE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] @@ -321,7 +300,7 @@ ; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> ; SSE-NEXT: ret <4 x i64> [[R03]] ; -; SLM-LABEL: @test_v4i64( +; SLM-LABEL: define {{[^@]+}}@test_v4i64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> ; SLM-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] @@ -331,13 +310,13 @@ ; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> ; SLM-NEXT: ret <4 x i64> [[R03]] ; -; AVX-LABEL: @test_v4i64( +; AVX-LABEL: define {{[^@]+}}@test_v4i64( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> ; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] ; AVX-NEXT: ret <4 x i64> [[TMP3]] ; -; AVX512-LABEL: @test_v4i64( +; AVX512-LABEL: define {{[^@]+}}@test_v4i64( ; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] @@ -363,7 +342,7 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @test_v8i32( +; SSE-LABEL: define {{[^@]+}}@test_v8i32( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] @@ -373,7 +352,7 @@ ; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R07]] ; -; SLM-LABEL: @test_v8i32( +; SLM-LABEL: define {{[^@]+}}@test_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] @@ -383,13 +362,13 @@ ; SLM-NEXT: [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[R07]] ; -; AVX-LABEL: @test_v8i32( +; AVX-LABEL: define {{[^@]+}}@test_v8i32( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> ; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] ; AVX-NEXT: ret <8 x i32> [[TMP3]] ; -; AVX512-LABEL: @test_v8i32( +; AVX512-LABEL: define {{[^@]+}}@test_v8i32( ; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] @@ -431,7 +410,7 @@ } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; SSE-LABEL: @test_v16i16( +; SSE-LABEL: define {{[^@]+}}@test_v16i16( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] @@ -441,19 +420,19 @@ ; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> ; SSE-NEXT: ret <16 x i16> [[RV15]] ; -; SLM-LABEL: @test_v16i16( +; SLM-LABEL: define {{[^@]+}}@test_v16i16( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> ; SLM-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] ; SLM-NEXT: ret <16 x i16> [[TMP3]] ; -; AVX-LABEL: @test_v16i16( +; AVX-LABEL: define {{[^@]+}}@test_v16i16( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> ; AVX-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] ; AVX-NEXT: ret <16 x i16> [[TMP3]] ; -; AVX512-LABEL: @test_v16i16( +; AVX512-LABEL: define {{[^@]+}}@test_v16i16( ; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] Index: llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -8,7 +8,7 @@ @res = external local_unnamed_addr global float, align 4 define float @baz() { -; CHECK-LABEL: @baz( +; CHECK-LABEL: define {{[^@]+}}@baz( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 @@ -35,30 +35,33 @@ ; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4 ; CHECK-NEXT: ret float [[ADD19_3]] ; -; THRESHOLD-LABEL: @baz( +; THRESHOLD-LABEL: define {{[^@]+}}@baz( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]] -; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]] -; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]] -; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]] -; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]] -; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]] -; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 +; THRESHOLD-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], [[TMP5]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <2 x i32> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP7]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[BIN_RDX]], i32 0 +; THRESHOLD-NEXT: [[TMP10:%.*]] = fadd fast float [[TMP9]], [[TMP8]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP10]], [[MUL4_1]] +; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[MUL4]] +; THRESHOLD-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], [[ADD_1]] +; THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP11]], [[OP_EXTRA3]] ; THRESHOLD-NEXT: store float [[ADD19_3]], float* @res, align 4 ; THRESHOLD-NEXT: ret float [[ADD19_3]] ; @@ -92,7 +95,7 @@ } define float @bazz() { -; CHECK-LABEL: @bazz( +; CHECK-LABEL: define {{[^@]+}}@bazz( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 @@ -114,7 +117,7 @@ ; CHECK-NEXT: store float [[OP_EXTRA5]], float* @res, align 4 ; CHECK-NEXT: ret float [[OP_EXTRA5]] ; -; THRESHOLD-LABEL: @bazz( +; THRESHOLD-LABEL: define {{[^@]+}}@bazz( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 @@ -180,7 +183,7 @@ } define float @bazzz() { -; CHECK-LABEL: @bazzz( +; CHECK-LABEL: define {{[^@]+}}@bazzz( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float @@ -196,7 +199,7 @@ ; CHECK-NEXT: store float [[TMP5]], float* @res, align 4 ; CHECK-NEXT: ret float [[TMP5]] ; -; THRESHOLD-LABEL: @bazzz( +; THRESHOLD-LABEL: define {{[^@]+}}@bazzz( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float @@ -236,7 +239,7 @@ } define i32 @foo() { -; CHECK-LABEL: @foo( +; CHECK-LABEL: define {{[^@]+}}@foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float @@ -253,7 +256,7 @@ ; CHECK-NEXT: store i32 [[CONV4]], i32* @n, align 4 ; CHECK-NEXT: ret i32 [[CONV4]] ; -; THRESHOLD-LABEL: @foo( +; THRESHOLD-LABEL: define {{[^@]+}}@foo( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float @@ -295,7 +298,7 @@ } define float @bar() { -; CHECK-LABEL: @bar( +; CHECK-LABEL: define {{[^@]+}}@bar( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 @@ -310,7 +313,7 @@ ; CHECK-NEXT: store float [[TMP3]], float* @res, align 4 ; CHECK-NEXT: ret float [[TMP3]] ; -; THRESHOLD-LABEL: @bar( +; THRESHOLD-LABEL: define {{[^@]+}}@bar( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 @@ -349,7 +352,7 @@ } define float @f(float* nocapture readonly %x) { -; CHECK-LABEL: @f( +; CHECK-LABEL: define {{[^@]+}}@f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 @@ -425,7 +428,7 @@ ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret float [[OP_RDX]] ; -; THRESHOLD-LABEL: @f( +; THRESHOLD-LABEL: define {{[^@]+}}@f( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 @@ -648,7 +651,7 @@ } define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) { -; CHECK-LABEL: @f1( +; CHECK-LABEL: define {{[^@]+}}@f1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float @@ -699,7 +702,7 @@ ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; CHECK-NEXT: ret float [[OP_EXTRA]] ; -; THRESHOLD-LABEL: @f1( +; THRESHOLD-LABEL: define {{[^@]+}}@f1( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float @@ -852,7 +855,7 @@ } define float @loadadd31(float* nocapture readonly %x) { -; CHECK-LABEL: @loadadd31( +; CHECK-LABEL: define {{[^@]+}}@loadadd31( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 @@ -919,12 +922,12 @@ ; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] ; CHECK-NEXT: ret float [[TMP12]] ; -; THRESHOLD-LABEL: @loadadd31( +; THRESHOLD-LABEL: define {{[^@]+}}@loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 -; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 +; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <2 x float>* +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 ; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 ; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 @@ -982,9 +985,11 @@ ; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] ; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 ; THRESHOLD-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] -; THRESHOLD-NEXT: ret float [[TMP12]] +; THRESHOLD-NEXT: [[RDX_SHUF18:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> +; THRESHOLD-NEXT: [[BIN_RDX19:%.*]] = fadd fast <2 x float> [[TMP1]], [[RDX_SHUF18]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[BIN_RDX19]], i32 0 +; THRESHOLD-NEXT: [[OP_RDX20:%.*]] = fadd fast float [[OP_RDX17]], [[TMP11]] +; THRESHOLD-NEXT: ret float [[OP_RDX20]] ; entry: %arrayidx = getelementptr inbounds float, float* %x, i64 1 @@ -1080,7 +1085,7 @@ } define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) { -; CHECK-LABEL: @extra_args( +; CHECK-LABEL: define {{[^@]+}}@extra_args( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float @@ -1105,7 +1110,7 @@ ; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: ret float [[OP_EXTRA5]] ; -; THRESHOLD-LABEL: @extra_args( +; THRESHOLD-LABEL: define {{[^@]+}}@extra_args( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float @@ -1162,7 +1167,7 @@ } define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a, i32 %b) { -; CHECK-LABEL: @extra_args_same_several_times( +; CHECK-LABEL: define {{[^@]+}}@extra_args_same_several_times( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float @@ -1189,7 +1194,7 @@ ; CHECK-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]] ; CHECK-NEXT: ret float [[OP_EXTRA7]] ; -; THRESHOLD-LABEL: @extra_args_same_several_times( +; THRESHOLD-LABEL: define {{[^@]+}}@extra_args_same_several_times( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float @@ -1250,7 +1255,7 @@ } define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b, i32 %c) { -; CHECK-LABEL: @extra_args_no_replace( +; CHECK-LABEL: define {{[^@]+}}@extra_args_no_replace( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float @@ -1277,7 +1282,7 @@ ; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: ret float [[OP_EXTRA5]] ; -; THRESHOLD-LABEL: @extra_args_no_replace( +; THRESHOLD-LABEL: define {{[^@]+}}@extra_args_no_replace( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float @@ -1338,7 +1343,7 @@ } define i32 @wobble(i32 %arg, i32 %bar) { -; CHECK-LABEL: @wobble( +; CHECK-LABEL: define {{[^@]+}}@wobble( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1 @@ -1361,7 +1366,7 @@ ; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] ; CHECK-NEXT: ret i32 [[OP_EXTRA3]] ; -; THRESHOLD-LABEL: @wobble( +; THRESHOLD-LABEL: define {{[^@]+}}@wobble( ; THRESHOLD-NEXT: bb: ; THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0 ; THRESHOLD-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1 Index: llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -10,7 +10,7 @@ @var = global i32 zeroinitializer, align 8 define i32 @maxi8(i32) { -; CHECK-LABEL: @maxi8( +; CHECK-LABEL: define {{[^@]+}}@maxi8( ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]] @@ -50,7 +50,7 @@ } define i32 @maxi16(i32) { -; CHECK-LABEL: @maxi16( +; CHECK-LABEL: define {{[^@]+}}@maxi16( ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]] @@ -117,7 +117,7 @@ } define i32 @maxi32(i32) { -; CHECK-LABEL: @maxi32( +; CHECK-LABEL: define {{[^@]+}}@maxi32( ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]] @@ -235,7 +235,7 @@ } define float @maxf8(float) { -; CHECK-LABEL: @maxf8( +; CHECK-LABEL: define {{[^@]+}}@maxf8( ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]] @@ -275,7 +275,7 @@ } define float @maxf16(float) { -; CHECK-LABEL: @maxf16( +; CHECK-LABEL: define {{[^@]+}}@maxf16( ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]] @@ -342,7 +342,7 @@ } define float @maxf32(float) { -; CHECK-LABEL: @maxf32( +; CHECK-LABEL: define {{[^@]+}}@maxf32( ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]] @@ -460,7 +460,7 @@ } define i32 @maxi8_mutiple_uses(i32) { -; SSE-LABEL: @maxi8_mutiple_uses( +; SSE-LABEL: define {{[^@]+}}@maxi8_mutiple_uses( ; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 ; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 ; SSE-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] @@ -485,57 +485,59 @@ ; SSE-NEXT: store i32 [[TMP15]], i32* @var, align 8 ; SSE-NEXT: ret i32 [[TMP14]] ; -; AVX-LABEL: @maxi8_mutiple_uses( -; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> -; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] -; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] +; AVX-LABEL: define {{[^@]+}}@maxi8_mutiple_uses( +; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 +; AVX-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> +; AVX-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i32> [[TMP2]], [[RDX_SHUF4]] +; AVX-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i32> [[TMP2]], <2 x i32> [[RDX_SHUF4]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]] ; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP7]] -; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] -; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP5]] -; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 [[TMP5]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA]], i32 [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP4]], i32 3, i32 4 -; AVX-NEXT: store i32 [[TMP15]], i32* @var, align 8 -; AVX-NEXT: ret i32 [[TMP14]] +; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], [[TMP5]] +; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 [[TMP5]] +; AVX-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP3]] +; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP3]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[OP_EXTRA]], i32 [[TMP10]] +; AVX-NEXT: [[TMP13:%.*]] = select i1 undef, i32 3, i32 4 +; AVX-NEXT: store i32 [[TMP13]], i32* @var, align 8 +; AVX-NEXT: ret i32 [[TMP12]] ; -; AVX2-LABEL: @maxi8_mutiple_uses( -; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] -; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] +; AVX2-LABEL: define {{[^@]+}}@maxi8_mutiple_uses( +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 +; AVX2-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> +; AVX2-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i32> [[TMP2]], [[RDX_SHUF4]] +; AVX2-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i32> [[TMP2]], <2 x i32> [[RDX_SHUF4]] +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 +; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> +; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP7]] -; AVX2-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] -; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP5]] -; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 [[TMP5]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA]], i32 [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP4]], i32 3, i32 4 -; AVX2-NEXT: store i32 [[TMP15]], i32* @var, align 8 -; AVX2-NEXT: ret i32 [[TMP14]] +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], [[TMP5]] +; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 [[TMP5]] +; AVX2-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP3]] +; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP3]] +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP10]] +; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[OP_EXTRA]], i32 [[TMP10]] +; AVX2-NEXT: [[TMP13:%.*]] = select i1 undef, i32 3, i32 4 +; AVX2-NEXT: store i32 [[TMP13]], i32* @var, align 8 +; AVX2-NEXT: ret i32 [[TMP12]] ; -; SKX-LABEL: @maxi8_mutiple_uses( +; SKX-LABEL: define {{[^@]+}}@maxi8_mutiple_uses( ; SKX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 ; SKX-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 ; SKX-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 @@ -594,7 +596,7 @@ } define i32 @maxi8_wrong_parent(i32) { -; SSE-LABEL: @maxi8_wrong_parent( +; SSE-LABEL: define {{[^@]+}}@maxi8_wrong_parent( ; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 ; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 ; SSE-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] @@ -619,7 +621,7 @@ ; SSE-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP5]] ; SSE-NEXT: ret i32 [[OP_EXTRA]] ; -; AVX-LABEL: @maxi8_wrong_parent( +; AVX-LABEL: define {{[^@]+}}@maxi8_wrong_parent( ; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 ; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 ; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] @@ -627,24 +629,25 @@ ; AVX: pp: ; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] ; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; AVX-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8 ; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] ; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] ; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]] -; AVX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]] -; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP5]] +; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <2 x i32> +; AVX-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i32> [[TMP7]], [[RDX_SHUF4]] +; AVX-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i32> [[TMP7]], <2 x i32> [[RDX_SHUF4]] +; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] +; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[OP_RDX]], [[TMP5]] +; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[OP_RDX]], i32 [[TMP5]] ; AVX-NEXT: ret i32 [[OP_EXTRA]] ; -; AVX2-LABEL: @maxi8_wrong_parent( +; AVX2-LABEL: define {{[^@]+}}@maxi8_wrong_parent( ; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 ; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 ; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] @@ -652,24 +655,25 @@ ; AVX2: pp: ; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] ; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; AVX2-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8 ; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]] -; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]] -; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP5]] +; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX2-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <2 x i32> +; AVX2-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i32> [[TMP7]], [[RDX_SHUF4]] +; AVX2-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i32> [[TMP7]], <2 x i32> [[RDX_SHUF4]] +; AVX2-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +; AVX2-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] +; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[OP_RDX]], [[TMP5]] +; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[OP_RDX]], i32 [[TMP5]] ; AVX2-NEXT: ret i32 [[OP_EXTRA]] ; -; SKX-LABEL: @maxi8_wrong_parent( +; SKX-LABEL: define {{[^@]+}}@maxi8_wrong_parent( ; SKX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 ; SKX-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 ; SKX-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 @@ -677,29 +681,30 @@ ; SKX-NEXT: br label [[PP:%.*]] ; SKX: pp: ; SKX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; SKX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; SKX-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8 ; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] ; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] ; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; SKX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; SKX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; SKX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; SKX-NEXT: [[TMP13:%.*]] = insertelement <2 x i1> undef, i1 [[TMP12]], i32 0 -; SKX-NEXT: [[TMP14:%.*]] = insertelement <2 x i1> [[TMP13]], i1 [[TMP5]], i32 1 -; SKX-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP11]], i32 0 -; SKX-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP3]], i32 1 -; SKX-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> undef, i32 [[TMP8]], i32 0 -; SKX-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP4]], i32 1 -; SKX-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP16]], <2 x i32> [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 -; SKX-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 -; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP20]] -; SKX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 [[TMP20]] +; SKX-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; SKX-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <2 x i32> +; SKX-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i32> [[TMP7]], [[RDX_SHUF4]] +; SKX-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i32> [[TMP7]], <2 x i32> [[RDX_SHUF4]] +; SKX-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +; SKX-NEXT: [[TMP11:%.*]] = insertelement <2 x i1> undef, i1 [[TMP10]], i32 0 +; SKX-NEXT: [[TMP12:%.*]] = insertelement <2 x i1> [[TMP11]], i1 [[TMP5]], i32 1 +; SKX-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP8]], i32 0 +; SKX-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP3]], i32 1 +; SKX-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP9]], i32 0 +; SKX-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP4]], i32 1 +; SKX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP14]], <2 x i32> [[TMP16]] +; SKX-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 +; SKX-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0 +; SKX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP19]], [[TMP18]] +; SKX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 [[TMP18]] ; SKX-NEXT: ret i32 [[OP_EXTRA]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 @@ -732,7 +737,7 @@ ; PR38191 - We don't handle array-of-pointer reductions. define i32* @maxp8(i32) { -; SSE-LABEL: @maxp8( +; SSE-LABEL: define {{[^@]+}}@maxp8( ; SSE-NEXT: [[TMP2:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 0), align 16 ; SSE-NEXT: [[TMP3:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 1), align 4 ; SSE-NEXT: [[TMP4:%.*]] = icmp ugt i32* [[TMP2]], [[TMP3]] @@ -757,7 +762,7 @@ ; SSE-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32* [[TMP20]], i32* [[TMP21]] ; SSE-NEXT: ret i32* [[TMP23]] ; -; AVX-LABEL: @maxp8( +; AVX-LABEL: define {{[^@]+}}@maxp8( ; AVX-NEXT: [[TMP2:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 0), align 16 ; AVX-NEXT: [[TMP3:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 1), align 4 ; AVX-NEXT: [[TMP4:%.*]] = icmp ugt i32* [[TMP2]], [[TMP3]] @@ -782,7 +787,7 @@ ; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32* [[TMP20]], i32* [[TMP21]] ; AVX-NEXT: ret i32* [[TMP23]] ; -; AVX2-LABEL: @maxp8( +; AVX2-LABEL: define {{[^@]+}}@maxp8( ; AVX2-NEXT: [[TMP2:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 0), align 16 ; AVX2-NEXT: [[TMP3:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 1), align 4 ; AVX2-NEXT: [[TMP4:%.*]] = icmp ugt i32* [[TMP2]], [[TMP3]] @@ -807,7 +812,7 @@ ; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32* [[TMP20]], i32* [[TMP21]] ; AVX2-NEXT: ret i32* [[TMP23]] ; -; SKX-LABEL: @maxp8( +; SKX-LABEL: define {{[^@]+}}@maxp8( ; SKX-NEXT: [[TMP2:%.*]] = load <2 x i32*>, <2 x i32*>* bitcast ([32 x i32*]* @arrp to <2 x i32*>*), align 16 ; SKX-NEXT: [[TMP3:%.*]] = extractelement <2 x i32*> [[TMP2]], i32 0 ; SKX-NEXT: [[TMP4:%.*]] = extractelement <2 x i32*> [[TMP2]], i32 1 Index: llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -16,7 +16,7 @@ ; } define i32 @add_red(float* %A, i32 %n) { -; CHECK-LABEL: @add_red( +; CHECK-LABEL: define {{[^@]+}}@add_red( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -53,7 +53,7 @@ ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; -; STORE-LABEL: @add_red( +; STORE-LABEL: define {{[^@]+}}@add_red( ; STORE-NEXT: entry: ; STORE-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; STORE-NEXT: br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -146,7 +146,7 @@ ; } define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) { -; CHECK-LABEL: @mul_red( +; CHECK-LABEL: define {{[^@]+}}@mul_red( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -188,7 +188,7 @@ ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; -; STORE-LABEL: @mul_red( +; STORE-LABEL: define {{[^@]+}}@mul_red( ; STORE-NEXT: entry: ; STORE-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; STORE-NEXT: br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -298,7 +298,7 @@ ; } define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) { -; CHECK-LABEL: @long_red( +; CHECK-LABEL: define {{[^@]+}}@long_red( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -361,7 +361,7 @@ ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; -; STORE-LABEL: @long_red( +; STORE-LABEL: define {{[^@]+}}@long_red( ; STORE-NEXT: entry: ; STORE-NEXT: [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; STORE-NEXT: br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -522,7 +522,7 @@ ; } define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) { -; CHECK-LABEL: @chain_red( +; CHECK-LABEL: define {{[^@]+}}@chain_red( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -564,7 +564,7 @@ ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; -; STORE-LABEL: @chain_red( +; STORE-LABEL: define {{[^@]+}}@chain_red( ; STORE-NEXT: entry: ; STORE-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; STORE-NEXT: br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -684,7 +684,7 @@ ; } define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) { -; CHECK-LABEL: @foo( +; CHECK-LABEL: define {{[^@]+}}@foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -744,7 +744,7 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]] ; -; STORE-LABEL: @foo( +; STORE-LABEL: define {{[^@]+}}@foo( ; STORE-NEXT: entry: ; STORE-NEXT: [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0 ; STORE-NEXT: br label [[FOR_BODY:%.*]] @@ -878,7 +878,7 @@ ; } define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) { -; CHECK-LABEL: @store_red_double( +; CHECK-LABEL: define {{[^@]+}}@store_red_double( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -907,7 +907,7 @@ ; CHECK: for.end: ; CHECK-NEXT: ret void ; -; STORE-LABEL: @store_red_double( +; STORE-LABEL: define {{[^@]+}}@store_red_double( ; STORE-NEXT: entry: ; STORE-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; STORE-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -926,11 +926,11 @@ ; STORE-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>* ; STORE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 ; STORE-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]] -; STORE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; STORE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; STORE-NEXT: [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]] +; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> undef, <2 x i32> +; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP5]], [[RDX_SHUF]] +; STORE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0 ; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]] -; STORE-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8 +; STORE-NEXT: store double [[TMP6]], double* [[ARRAYIDX9]], align 8 ; STORE-NEXT: [[INC]] = add nsw i64 [[I_018]], 1 ; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]] ; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]] @@ -981,7 +981,7 @@ ; } define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) { -; CHECK-LABEL: @store_red( +; CHECK-LABEL: define {{[^@]+}}@store_red( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -1025,7 +1025,7 @@ ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; -; STORE-LABEL: @store_red( +; STORE-LABEL: define {{[^@]+}}@store_red( ; STORE-NEXT: entry: ; STORE-NEXT: [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; STORE-NEXT: br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] @@ -1115,7 +1115,7 @@ @arr_float = global [32 x float] zeroinitializer, align 16 define void @float_red_example4(float* %res) { -; CHECK-LABEL: @float_red_example4( +; CHECK-LABEL: define {{[^@]+}}@float_red_example4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4 @@ -1127,7 +1127,7 @@ ; CHECK-NEXT: store float [[ADD_2]], float* [[RES:%.*]], align 16 ; CHECK-NEXT: ret void ; -; STORE-LABEL: @float_red_example4( +; STORE-LABEL: define {{[^@]+}}@float_red_example4( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16 ; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> @@ -1151,7 +1151,7 @@ } define void @float_red_example8(float* %res) { -; CHECK-LABEL: @float_red_example8( +; CHECK-LABEL: define {{[^@]+}}@float_red_example8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4 @@ -1171,7 +1171,7 @@ ; CHECK-NEXT: store float [[ADD_6]], float* [[RES:%.*]], align 16 ; CHECK-NEXT: ret void ; -; STORE-LABEL: @float_red_example8( +; STORE-LABEL: define {{[^@]+}}@float_red_example8( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16 ; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> undef, <8 x i32> @@ -1205,7 +1205,7 @@ } define void @float_red_example16(float* %res) { -; CHECK-LABEL: @float_red_example16( +; CHECK-LABEL: define {{[^@]+}}@float_red_example16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4 @@ -1241,7 +1241,7 @@ ; CHECK-NEXT: store float [[ADD_14]], float* [[RES:%.*]], align 16 ; CHECK-NEXT: ret void ; -; STORE-LABEL: @float_red_example16( +; STORE-LABEL: define {{[^@]+}}@float_red_example16( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16 ; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> undef, <16 x i32> @@ -1293,7 +1293,7 @@ } define void @i32_red_example4(i32* %res) { -; CHECK-LABEL: @i32_red_example4( +; CHECK-LABEL: define {{[^@]+}}@i32_red_example4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 @@ -1305,7 +1305,7 @@ ; CHECK-NEXT: store i32 [[ADD_2]], i32* [[RES:%.*]], align 16 ; CHECK-NEXT: ret void ; -; STORE-LABEL: @i32_red_example4( +; STORE-LABEL: define {{[^@]+}}@i32_red_example4( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 ; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> @@ -1329,7 +1329,7 @@ } define void @i32_red_example8(i32* %res) { -; CHECK-LABEL: @i32_red_example8( +; CHECK-LABEL: define {{[^@]+}}@i32_red_example8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 @@ -1349,7 +1349,7 @@ ; CHECK-NEXT: store i32 [[ADD_6]], i32* [[RES:%.*]], align 16 ; CHECK-NEXT: ret void ; -; STORE-LABEL: @i32_red_example8( +; STORE-LABEL: define {{[^@]+}}@i32_red_example8( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 ; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> @@ -1383,7 +1383,7 @@ } define void @i32_red_example16(i32* %res) { -; CHECK-LABEL: @i32_red_example16( +; CHECK-LABEL: define {{[^@]+}}@i32_red_example16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 @@ -1419,7 +1419,7 @@ ; CHECK-NEXT: store i32 [[ADD_14]], i32* [[RES:%.*]], align 16 ; CHECK-NEXT: ret void ; -; STORE-LABEL: @i32_red_example16( +; STORE-LABEL: define {{[^@]+}}@i32_red_example16( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16 ; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <16 x i32> @@ -1471,7 +1471,7 @@ } define void @i32_red_example32(i32* %res) { -; CHECK-LABEL: @i32_red_example32( +; CHECK-LABEL: define {{[^@]+}}@i32_red_example32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 @@ -1539,7 +1539,7 @@ ; CHECK-NEXT: store i32 [[ADD_30]], i32* [[RES:%.*]], align 16 ; CHECK-NEXT: ret void ; -; STORE-LABEL: @i32_red_example32( +; STORE-LABEL: define {{[^@]+}}@i32_red_example32( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16 ; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> undef, <32 x i32> @@ -1627,7 +1627,7 @@ declare i32 @foobar(i32) define void @i32_red_call(i32 %val) { -; CHECK-LABEL: @i32_red_call( +; CHECK-LABEL: define {{[^@]+}}@i32_red_call( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> @@ -1640,7 +1640,7 @@ ; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]]) ; CHECK-NEXT: ret void ; -; STORE-LABEL: @i32_red_call( +; STORE-LABEL: define {{[^@]+}}@i32_red_call( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 ; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> @@ -1674,7 +1674,7 @@ } define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 { -; CHECK-LABEL: @i32_red_invoke( +; CHECK-LABEL: define {{[^@]+}}@i32_red_invoke( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> @@ -1693,7 +1693,7 @@ ; CHECK: normal: ; CHECK-NEXT: ret void ; -; STORE-LABEL: @i32_red_invoke( +; STORE-LABEL: define {{[^@]+}}@i32_red_invoke( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 ; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32>