Diff 341200

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,028 Lines • ▼ Show 20 Lines	if (VectorizableTree.size() == 1 &&
VectorizableTree[0]->State == TreeEntry::Vectorize)		VectorizableTree[0]->State == TreeEntry::Vectorize)
return true;		return true;

if (VectorizableTree.size() != 2)		if (VectorizableTree.size() != 2)
return false;		return false;

// Handle splat and all-constants stores. Also try to vectorize tiny trees		// Handle splat and all-constants stores. Also try to vectorize tiny trees
// with the second gather nodes if they have less scalar operands rather than		// with the second gather nodes if they have less scalar operands rather than
// the initial tree element (may be profitable to shuffle the second gather).		// the initial tree element (may be profitable to shuffle the second gather).
		SmallVector<int> Mask;
		david-arm (Unsubmitted, Not Done): Does the comment need updating here to reflect the change?
if (VectorizableTree[0]->State == TreeEntry::Vectorize &&		if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
(allConstant(VectorizableTree[1]->Scalars) \|\|		(allConstant(VectorizableTree[1]->Scalars) \|\|
isSplat(VectorizableTree[1]->Scalars) \|\|		isSplat(VectorizableTree[1]->Scalars) \|\|
(VectorizableTree[1]->State == TreeEntry::NeedToGather &&		(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
VectorizableTree[1]->Scalars.size() <		VectorizableTree[1]->Scalars.size() <
VectorizableTree[0]->Scalars.size())))		VectorizableTree[0]->Scalars.size()) \|\|
		(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
		VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
		isShuffle(VectorizableTree[1]->Scalars, Mask))))
return true;		return true;

// Gathering cost would be too much for tiny trees.		// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0]->State == TreeEntry::NeedToGather \|\|		if (VectorizableTree[0]->State == TreeEntry::NeedToGather \|\|
VectorizableTree[1]->State == TreeEntry::NeedToGather)		VectorizableTree[1]->State == TreeEntry::NeedToGather)
return false;		return false;

return true;		return true;
▲ Show 20 Lines • Show All 4,043 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

	Show All 24 Lines
	;			;
	; NOACCELERATE-LABEL: @int_sin_4x(			; NOACCELERATE-LABEL: @int_sin_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
				RKSimon (Unsubmitted, Not Done): Why do many of these libm vectorizations result in a v2f32 call and 2 x f32 scalar calls? I'd expect either 2 x v2f32 or a v4f32.
				ABataev (Author; Unsubmitted, Done): Cost model. The cost of the 4x calls is too high (`Call cost 18 (58-40) for %1 = tail call fast float @llvm.sin.f32(float %vecext)`) and the cost of the 2x calls is also high (`Call cost 6 (26-20) for %1 = tail call fast float @llvm.sin.f32(float %vecext)`), but the cost of the extractelements with indices 1-2 is 5 (they are removed by the vectorizer), plus the compensation for the costs of the inserts.
				david-arm (Unsubmitted, Not Done): I guess it is a bit difficult to follow the logic here. I think I can understand that extracting element 0 is basically free, so keeping the first scalar llvm.sin.f32 makes sense, I suppose? Then we decide to make a vector call for elements 1 + 2, although I can't see where they are removed by the vectoriser. It still looks like we have 4 extractelements from the original <4 x float> vector. I did try out the patch, though, and I can see that with these changes we end up with 5 more lines of assembly in the generated code for this function, so it doesn't seem like a win, to be honest. Perhaps there is an issue with the AArch64 cost model for the math calls?
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.sin.f32(float %vecext)			%1 = tail call fast float @llvm.sin.f32(float %vecext)
	%vecins = insertelement <4 x float> poison, float %1, i32 0			%vecins = insertelement <4 x float> poison, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	▲ Show 20 Lines • Show All 259 Lines • ▼ Show 20 Lines
	;			;
	; NOACCELERATE-LABEL: @exp_4x(			; NOACCELERATE-LABEL: @exp_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @expf(float %vecext)			%1 = tail call fast float @expf(float %vecext)
	%vecins = insertelement <4 x float> poison, float %1, i32 0			%vecins = insertelement <4 x float> poison, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines
	;			;
	; NOACCELERATE-LABEL: @log_4x(			; NOACCELERATE-LABEL: @log_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @logf(float %vecext)			%1 = tail call fast float @logf(float %vecext)
	%vecins = insertelement <4 x float> poison, float %1, i32 0			%vecins = insertelement <4 x float> poison, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	▲ Show 20 Lines • Show All 175 Lines • ▼ Show 20 Lines
	;			;
	; NOACCELERATE-LABEL: @sin_4x(			; NOACCELERATE-LABEL: @sin_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @sinf(float %vecext)			%1 = tail call fast float @sinf(float %vecext)
	%vecins = insertelement <4 x float> poison, float %1, i32 0			%vecins = insertelement <4 x float> poison, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	Show All 25 Lines
	;			;
	; NOACCELERATE-LABEL: @cos_4x(			; NOACCELERATE-LABEL: @cos_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @cosf(float %vecext)			%1 = tail call fast float @cosf(float %vecext)
	%vecins = insertelement <4 x float> poison, float %1, i32 0			%vecins = insertelement <4 x float> poison, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	▲ Show 20 Lines • Show All 499 Lines • ▼ Show 20 Lines
	}			}

	; Accelerate does not provide sin() for <2 x float>.			; Accelerate does not provide sin() for <2 x float>.
	define <2 x float> @sin_2x(<2 x float>* %a) {			define <2 x float> @sin_2x(<2 x float>* %a) {
	; CHECK-LABEL: @sin_2x(			; CHECK-LABEL: @sin_2x(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.]] = load <2 x float>, <2 x float> [[A:%.*]], align 16			; CHECK-NEXT: [[TMP0:%.]] = load <2 x float>, <2 x float> [[A:%.*]], align 16
	; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0			; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
	; CHECK-NEXT: [[TMP1:%.]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) [[ATTR2:#.]]			; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]]
	; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0			; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
	; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1			; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
	; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) [[ATTR2]]			; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]]
	; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1			; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
	; CHECK-NEXT: ret <2 x float> [[VECINS_1]]			; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
	;			;
	; NOACCELERATE-LABEL: @sin_2x(			; NOACCELERATE-LABEL: @sin_2x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <2 x float>, <2 x float> [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <2 x float>, <2 x float> [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
	Show All 35 Lines
	;			;
	; NOACCELERATE-LABEL: @int_cos_4x(			; NOACCELERATE-LABEL: @int_cos_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.cos.f32(float %vecext)			%1 = tail call fast float @llvm.cos.f32(float %vecext)
	%vecins = insertelement <4 x float> poison, float %1, i32 0			%vecins = insertelement <4 x float> poison, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	Show All 9 Lines
	}			}

	; Accelerate does not provide cos() for <2 x float>.			; Accelerate does not provide cos() for <2 x float>.
	define <2 x float> @cos_2x(<2 x float>* %a) {			define <2 x float> @cos_2x(<2 x float>* %a) {
	; CHECK-LABEL: @cos_2x(			; CHECK-LABEL: @cos_2x(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.]] = load <2 x float>, <2 x float> [[A:%.*]], align 16			; CHECK-NEXT: [[TMP0:%.]] = load <2 x float>, <2 x float> [[A:%.*]], align 16
	; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0			; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
	; CHECK-NEXT: [[TMP1:%.]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) [[ATTR3:#.]]			; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
	; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0			; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
	; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1			; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
	; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) [[ATTR3]]			; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]]
	; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1			; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
	; CHECK-NEXT: ret <2 x float> [[VECINS_1]]			; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
	;			;
	; NOACCELERATE-LABEL: @cos_2x(			; NOACCELERATE-LABEL: @cos_2x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <2 x float>, <2 x float> [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <2 x float>, <2 x float> [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
	Show All 16 Lines

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll

	Show All 24 Lines
	;			;
	; NOACCELERATE-LABEL: @int_sin_4x(			; NOACCELERATE-LABEL: @int_sin_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, <4 x float> [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.sin.f32(float %vecext)			%1 = tail call fast float @llvm.sin.f32(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	▲ Show 20 Lines • Show All 259 Lines • ▼ Show 20 Lines
	;			;
	; NOACCELERATE-LABEL: @exp_4x(			; NOACCELERATE-LABEL: @exp_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @expf(float %vecext)			%1 = tail call fast float @expf(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines
	;			;
	; NOACCELERATE-LABEL: @log_4x(			; NOACCELERATE-LABEL: @log_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @logf(float %vecext)			%1 = tail call fast float @logf(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	▲ Show 20 Lines • Show All 175 Lines • ▼ Show 20 Lines
	;			;
	; NOACCELERATE-LABEL: @sin_4x(			; NOACCELERATE-LABEL: @sin_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @sinf(float %vecext)			%1 = tail call fast float @sinf(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	Show All 25 Lines
	;			;
	; NOACCELERATE-LABEL: @cos_4x(			; NOACCELERATE-LABEL: @cos_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @cosf(float %vecext)			%1 = tail call fast float @cosf(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	▲ Show 20 Lines • Show All 499 Lines • ▼ Show 20 Lines
	}			}

	; Accelerate does not provide sin() for <2 x float>.			; Accelerate does not provide sin() for <2 x float>.
	define <2 x float> @sin_2x(<2 x float>* %a) {			define <2 x float> @sin_2x(<2 x float>* %a) {
	; CHECK-LABEL: @sin_2x(			; CHECK-LABEL: @sin_2x(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16			; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
	; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0			; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
	; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #2			; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]]
	; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0			; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
	; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1			; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
	; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #2			; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]]
	; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1			; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
	; CHECK-NEXT: ret <2 x float> [[VECINS_1]]			; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
	;			;
	; NOACCELERATE-LABEL: @sin_2x(			; NOACCELERATE-LABEL: @sin_2x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
	Show All 35 Lines
	;			;
	; NOACCELERATE-LABEL: @int_cos_4x(			; NOACCELERATE-LABEL: @int_cos_4x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
	; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2			; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
				; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
				; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
				; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
				; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
				; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
	; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])			; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
	; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3			; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]			; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, <4 x float>* %a, align 16			%0 = load <4 x float>, <4 x float>* %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.cos.f32(float %vecext)			%1 = tail call fast float @llvm.cos.f32(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	Show All 9 Lines
	}			}

	; Accelerate does not provide cos() for <2 x float>.			; Accelerate does not provide cos() for <2 x float>.
	define <2 x float> @cos_2x(<2 x float>* %a) {			define <2 x float> @cos_2x(<2 x float>* %a) {
	; CHECK-LABEL: @cos_2x(			; CHECK-LABEL: @cos_2x(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16			; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
	; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0			; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
	; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #3			; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
	; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0			; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
	; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1			; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
	; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #3			; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]]
	; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1			; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
	; CHECK-NEXT: ret <2 x float> [[VECINS_1]]			; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
	;			;
	; NOACCELERATE-LABEL: @cos_2x(			; NOACCELERATE-LABEL: @cos_2x(
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
	Show All 16 Lines

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

	Show All 11 Lines
	; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>			; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
	; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>			; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
	; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]			; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
	; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0			; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
	; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64			; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64
	; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]]			; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]]
	; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4			; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4
	; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1			; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
	; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64
	; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]]
	; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
	; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2			; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
	; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64			; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[E1]], i32 0
				david-armUnsubmitted Not Done Reply Inline Actions At first glance this looks worse, but I've tried out your patch and can see that the generated code is the same: the entire first sequence of inserts, sext and trunc gets folded away, since the sext + trunc is basically a no-op. david-arm: At first glance this looks worse, but I've tried out your patch and can see the generated code…
				ABataevAuthorUnsubmitted Done Reply Inline Actions Yeah, llvm-mca gives a throughput of 13.5 for the non-vectorized code and 15.5 with the vectorized call (the difference is smaller on newer processors). This looks like another instance of the known problem of overly optimistic user-cost compensation. It should go away once we land the proper implementation of insertelement instruction vectorization, but in the meantime I'll try to prepare a temporary patch to improve the situation. ABataev: Yeah, llvm-mca gives throughput 13.5 without being vectorized and 15.5 with vectorized call…
	; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]]			; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[E2]], i32 1
				; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
				; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
				; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
				; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64
				; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP5]]
				; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
				; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
				; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
				; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]]
	; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4			; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4
	; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3			; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
	; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64			; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64
	; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S3]]			; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S3]]
	; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4			; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4
	; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])			; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	▲ Show 20 Lines • Show All 78 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE		; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512

define <8 x float> @sitofp_uitofp(<8 x i32> %a) {		define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
; SSE-LABEL: @sitofp_uitofp(		; SSE-LABEL: @sitofp_uitofp(
; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0		; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1		; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2		; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3		; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4		; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float
; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float
; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float
; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float
; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float
; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float
; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float
; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
; SSE-NEXT: ret <8 x float> [[R7]]		; SSE-NEXT: ret <8 x float> [[R7]]
;		;
; SLM-LABEL: @sitofp_uitofp(		; SLM-LABEL: @sitofp_uitofp(
; SLM-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>		; SLM-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
; SLM-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>		; SLM-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>		; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: ret <8 x float> [[R7]]		; SLM-NEXT: ret <8 x float> [[R7]]
;		;
Show All 33 Lines	;
%r5 = insertelement <8 x float> %r4, float %ab5, i32 5		%r5 = insertelement <8 x float> %r4, float %ab5, i32 5
%r6 = insertelement <8 x float> %r5, float %ab6, i32 6		%r6 = insertelement <8 x float> %r5, float %ab6, i32 6
%r7 = insertelement <8 x float> %r6, float %ab7, i32 7		%r7 = insertelement <8 x float> %r6, float %ab7, i32 7
ret <8 x float> %r7		ret <8 x float> %r7
}		}

define <8 x i32> @fptosi_fptoui(<8 x float> %a) {		define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
; SSE-LABEL: @fptosi_fptoui(		; SSE-LABEL: @fptosi_fptoui(
; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0		; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5		; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6		; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7		; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32		; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32		; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32
; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32
; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32		; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32		; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32		; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32		; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0		; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1		; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2		; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3		; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1
		; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
		; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2
		; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
		; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4		; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5		; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6		; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7		; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
; SSE-NEXT: ret <8 x i32> [[R7]]		; SSE-NEXT: ret <8 x i32> [[R7]]
;		;
; SLM-LABEL: @fptosi_fptoui(		; SLM-LABEL: @fptosi_fptoui(
; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0		; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
Show All 18 Lines
; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3		; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4		; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5		; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6		; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7		; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
; SLM-NEXT: ret <8 x i32> [[R7]]		; SLM-NEXT: ret <8 x i32> [[R7]]
;		;
; AVX-LABEL: @fptosi_fptoui(		; AVX-LABEL: @fptosi_fptoui(
; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0		; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1		; AVX-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2		; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3		; AVX-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
; AVX-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4		; AVX-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32
; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32
; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32
; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32
; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
; AVX-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
; AVX-NEXT: ret <8 x i32> [[R7]]		; AVX-NEXT: ret <8 x i32> [[R7]]
;		;
; AVX512-LABEL: @fptosi_fptoui(		; AVX512-LABEL: @fptosi_fptoui(
; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>		; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>		; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>		; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; AVX512-NEXT: ret <8 x i32> [[R7]]		; AVX512-NEXT: ret <8 x i32> [[R7]]
;		;
▲ Show 20 Lines • Show All 160 Lines • ▼ Show 20 Lines	;
%r6 = insertelement <8 x float> %r5, float %ab6, i32 6		%r6 = insertelement <8 x float> %r5, float %ab6, i32 6
%r7 = insertelement <8 x float> %r6, float %ab7, i32 7		%r7 = insertelement <8 x float> %r6, float %ab7, i32 7
ret <8 x float> %r7		ret <8 x float> %r7
}		}

; Inspired by PR38154		; Inspired by PR38154
define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {		define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(		; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
; SSE-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0		; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 2
; SSE-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3		; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0		; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1		; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
; SSE-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0		; SSE-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
; SSE-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1		; SSE-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float		; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float		; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
; SSE-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float		; SSE-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float
; SSE-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float		; SSE-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float
; SSE-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float		; SSE-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float
; SSE-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float		; SSE-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float
; SSE-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float		; SSE-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float
; SSE-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float		; SSE-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0		; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1		; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
		; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
		; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2		; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3		; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4		; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5		; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6		; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7		; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
; SSE-NEXT: ret <8 x float> [[R7]]		; SSE-NEXT: ret <8 x float> [[R7]]
;		;
▲ Show 20 Lines • Show All 103 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE		; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512

define <8 x float> @sitofp_uitofp(<8 x i32> %a) {		define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
; SSE-LABEL: @sitofp_uitofp(		; SSE-LABEL: @sitofp_uitofp(
; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0		; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1		; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2		; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3		; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4		; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float
; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float
; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float
; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float
; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float
; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float
; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float
; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
; SSE-NEXT: ret <8 x float> [[R7]]		; SSE-NEXT: ret <8 x float> [[R7]]
;		;
; SLM-LABEL: @sitofp_uitofp(		; SLM-LABEL: @sitofp_uitofp(
; SLM-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>		; SLM-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
; SLM-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>		; SLM-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>		; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: ret <8 x float> [[R7]]		; SLM-NEXT: ret <8 x float> [[R7]]
;		;
Show All 33 Lines	;
%r5 = insertelement <8 x float> %r4, float %ab5, i32 5		%r5 = insertelement <8 x float> %r4, float %ab5, i32 5
%r6 = insertelement <8 x float> %r5, float %ab6, i32 6		%r6 = insertelement <8 x float> %r5, float %ab6, i32 6
%r7 = insertelement <8 x float> %r6, float %ab7, i32 7		%r7 = insertelement <8 x float> %r6, float %ab7, i32 7
ret <8 x float> %r7		ret <8 x float> %r7
}		}

define <8 x i32> @fptosi_fptoui(<8 x float> %a) {		define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
; SSE-LABEL: @fptosi_fptoui(		; SSE-LABEL: @fptosi_fptoui(
; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0		; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5		; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6		; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7		; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32		; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32		; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32
; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32
; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32		; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32		; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32		; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32		; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0		; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1		; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2		; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3		; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1
		; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
		; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2
		; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
		; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4		; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5		; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6		; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7		; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
; SSE-NEXT: ret <8 x i32> [[R7]]		; SSE-NEXT: ret <8 x i32> [[R7]]
;		;
; SLM-LABEL: @fptosi_fptoui(		; SLM-LABEL: @fptosi_fptoui(
; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0		; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
Show All 18 Lines
; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3		; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4		; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5		; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6		; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7		; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
; SLM-NEXT: ret <8 x i32> [[R7]]		; SLM-NEXT: ret <8 x i32> [[R7]]
;		;
; AVX-LABEL: @fptosi_fptoui(		; AVX-LABEL: @fptosi_fptoui(
; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0		; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1		; AVX-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2		; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3		; AVX-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
; AVX-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4		; AVX-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32
; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32
; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32
; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32
; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
; AVX-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
; AVX-NEXT: ret <8 x i32> [[R7]]		; AVX-NEXT: ret <8 x i32> [[R7]]
;		;
; AVX512-LABEL: @fptosi_fptoui(		; AVX512-LABEL: @fptosi_fptoui(
; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>		; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>		; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>		; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; AVX512-NEXT: ret <8 x i32> [[R7]]		; AVX512-NEXT: ret <8 x i32> [[R7]]
;		;
▲ Show 20 Lines • Show All 160 Lines • ▼ Show 20 Lines	;
%r6 = insertelement <8 x float> %r5, float %ab6, i32 6		%r6 = insertelement <8 x float> %r5, float %ab6, i32 6
%r7 = insertelement <8 x float> %r6, float %ab7, i32 7		%r7 = insertelement <8 x float> %r6, float %ab7, i32 7
ret <8 x float> %r7		ret <8 x float> %r7
}		}

; Inspired by PR38154		; Inspired by PR38154
define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {		define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(		; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
; SSE-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0		; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 2
; SSE-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3		; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0		; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1		; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
; SSE-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0		; SSE-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
; SSE-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1		; SSE-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float		; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float		; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
; SSE-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float		; SSE-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float
; SSE-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float		; SSE-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float
; SSE-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float		; SSE-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float
; SSE-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float		; SSE-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float
; SSE-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float		; SSE-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float
; SSE-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float		; SSE-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0		; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1		; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0
		; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
		; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2		; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3		; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4		; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5		; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6		; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7		; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
; SSE-NEXT: ret <8 x float> [[R7]]		; SSE-NEXT: ret <8 x float> [[R7]]
;		;
▲ Show 20 Lines • Show All 103 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Try to vectorize tiny trees with shuffled gathers of extractelements.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 341200

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll

llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Try to vectorize tiny trees with shuffled gathers of extractelements.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 341200

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll

llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll

[SLP]Try to vectorize tiny trees with shuffled gathers of extractelements.
ClosedPublic