diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3551,10 +3551,10 @@ // vectorized tree. if (areAllUsersVectorized(cast(V)) && !ScalarToTreeEntry.count(V)) { - auto *IO = cast( - cast(V)->getIndexOperand()); - Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - IO->getZExtValue()); + auto *EE = cast(V); + Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + EE->getVectorOperandType(), + *getExtractIndex(EE)); } } return ReuseShuffleCost + Cost; @@ -3582,11 +3582,10 @@ unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { if (ShuffleOrOp == Instruction::ExtractElement) { - auto *IO = cast( - cast(VL[I])->getIndexOperand()); - Idx = IO->getZExtValue(); + auto *EE = cast(VL[I]); ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); + Instruction::ExtractElement, EE->getVectorOperandType(), + *getExtractIndex(EE)); } else { ReuseShuffleCost -= TTI->getVectorInstrCost( Instruction::ExtractElement, VecTy, Idx); @@ -3596,14 +3595,15 @@ Idx = ReuseShuffleNumbers; for (Value *V : VL) { if (ShuffleOrOp == Instruction::ExtractElement) { - auto *IO = cast( - cast(V)->getIndexOperand()); - Idx = IO->getZExtValue(); + auto *EE = cast(V); + ReuseShuffleCost += TTI->getVectorInstrCost( + Instruction::ExtractElement, EE->getVectorOperandType(), + *getExtractIndex(EE)); } else { --Idx; + ReuseShuffleCost += TTI->getVectorInstrCost( + Instruction::ExtractElement, VecTy, Idx); } - ReuseShuffleCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } CommonCost = ReuseShuffleCost; } else if (!E->ReorderIndices.empty()) { @@ -3613,28 +3613,37 @@ TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask); } for (unsigned I = 0, E = VL.size(); I < E; ++I) { - Instruction *EI = cast(VL[I]); - // If all users are going to be vectorized, instruction can be - // considered as dead. - // The same, if have only one user, it will be vectorized for sure. - if (areAllUsersVectorized(EI)) { - // Take credit for instruction that will become dead. - if (EI->hasOneUse()) { - Instruction *Ext = EI->user_back(); - if ((isa(Ext) || isa(Ext)) && - all_of(Ext->users(), - [](User *U) { return isa(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. + auto *EI = cast(VL[I]); + // Take credit for instruction that will become dead. + if (EI->hasOneUse()) { + Instruction *Ext = EI->user_back(); + if ((isa(Ext) || isa(Ext)) && + all_of(Ext->users(), + [](User *U) { return isa(U); })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *EE = cast(EI); + CommonCost -= TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(), + *getExtractIndex(EE)); + } else { CommonCost -= TTI->getExtractWithExtendCost( Ext->getOpcode(), Ext->getType(), VecTy, I); - // Add back the cost of s|zext which is subtracted separately. - CommonCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EI->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); - continue; } + // Add back the cost of s|zext which is subtracted separately. + CommonCost += TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EI->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; } + } + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *EE = cast(EI); + CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + EE->getVectorOperandType(), + *getExtractIndex(EE)); + } else { CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -145,22 +145,21 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]] ; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]] -; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1_0]], i32 0 ; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1 -; CHECK-NEXT: [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2 -; CHECK-NEXT: [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3 +; CHECK-NEXT: [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -145,22 +145,21 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]] ; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]] -; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0 ; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1 -; CHECK-NEXT: [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2 -; CHECK-NEXT: [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3 +; CHECK-NEXT: [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -110,22 +110,22 @@ ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] -; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> +; AVX1-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> +; AVX1-NEXT: [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> +; AVX1-NEXT: [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> +; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32( @@ -425,10 +425,10 @@ ; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 ; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 +; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 ; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; CHECK-NEXT: ret <8 x i32> [[R7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -110,22 +110,22 @@ ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] -; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> +; AVX1-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> +; AVX1-NEXT: [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> +; AVX1-NEXT: [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> +; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll @@ -46,13 +46,13 @@ ; CHECK-NEXT: ret float [[X0]] ; ; THRESH1-LABEL: @f_used_out_of_tree( -; THRESH1-NEXT: [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 -; THRESH1-NEXT: [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1 -; THRESH1-NEXT: [[X0X0:%.*]] = fmul float [[X0]], [[X0]] -; THRESH1-NEXT: [[X1X1:%.*]] = fmul float [[X1]], [[X1]] -; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]] +; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 +; THRESH1-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]] +; THRESH1-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESH1-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] ; THRESH1-NEXT: store float [[ADD]], float* @a, align 4 -; THRESH1-NEXT: ret float [[X0]] +; THRESH1-NEXT: ret float [[TMP1]] ; ; THRESH2-LABEL: @f_used_out_of_tree( ; THRESH2-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0