diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4035,12 +4035,16 @@
   // Handle splat and all-constants stores. Also try to vectorize tiny trees
   // with the second gather nodes if they have less scalar operands rather than
   // the initial tree element (may be profitable to shuffle the second gather).
+  SmallVector<int> Mask;
   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
       (allConstant(VectorizableTree[1]->Scalars) ||
        isSplat(VectorizableTree[1]->Scalars) ||
        (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
         VectorizableTree[1]->Scalars.size() <
-            VectorizableTree[0]->Scalars.size())))
+            VectorizableTree[0]->Scalars.size()) ||
+       (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+        VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
+        isShuffle(VectorizableTree[1]->Scalars, Mask))))
     return true;

   // Gathering cost would be too much for tiny trees.
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -30,14 +30,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -313,14 +316,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float>
poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -411,14 +417,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> 
[[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -610,14 +619,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -659,14 +671,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float 
[[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -1182,10 +1197,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) [[ATTR2:#.*]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) [[ATTR2]] +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]] ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]] ; @@ -1237,14 +1252,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -1270,10 +1288,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) [[ATTR3:#.*]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float 
@llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) [[ATTR3]] +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]] ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -30,14 +30,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -313,14 +316,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; 
NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -411,14 +417,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -610,14 +619,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; 
NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -659,14 +671,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -1182,10 +1197,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; 
CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #2 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #2 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]] ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]] ; @@ -1237,14 +1252,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -1270,10 +1288,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #3 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #3 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]] ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]] ; diff --git 
a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -17,12 +17,18 @@ ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]] ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4 ; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]] -; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4 ; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[E1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[E2]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP5]] +; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]] ; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4 ; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 ; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -8,30 +8,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float -; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float -; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float -; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float -; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x 
float> [[R3]], float [[AB4]], i32 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @sitofp_uitofp( @@ -81,26 +62,24 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE-LABEL: @fptosi_fptoui( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> ; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -135,30 +114,11 @@ ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX-LABEL: @fptosi_fptoui( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; AVX-NEXT: [[A4:%.*]] = 
extractelement <8 x float> [[A]], i32 4 -; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 -; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 -; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 -; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 -; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 -; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; AVX-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> +; AVX-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; AVX-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @fptosi_fptoui( @@ -335,24 +295,24 @@ ; Inspired by PR38154 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) { ; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; SSE-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 2 ; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 ; SSE-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 ; SSE-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; SSE-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float ; SSE-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float ; SSE-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float ; SSE-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float ; SSE-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float ; SSE-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> 
[[R0]], float [[TMP4]], i32 1 ; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 ; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -8,30 +8,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float -; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float -; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float -; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float -; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @sitofp_uitofp( @@ -81,26 +62,24 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE-LABEL: @fptosi_fptoui( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; 
SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> ; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -135,30 +114,11 @@ ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX-LABEL: @fptosi_fptoui( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; AVX-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 -; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 -; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 -; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 -; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 -; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 -; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; AVX-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> +; AVX-NEXT: [[R7:%.*]] 
= shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; AVX-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @fptosi_fptoui( @@ -335,24 +295,24 @@ ; Inspired by PR38154 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) { ; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; SSE-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 2 ; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 ; SSE-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 ; SSE-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; SSE-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float ; SSE-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float ; SSE-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float ; SSE-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float ; SSE-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float ; SSE-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1 ; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 ; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
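
The added clause in the tiny-tree check accepts a two-node tree whose second (gather) node is built from extractelements that together form a single shuffle of one source vector; that is the pattern the regenerated AArch64 checks above exercise. A minimal IR sketch of the pattern, with a hypothetical function name and lane choice reduced from the sin tests (the real tests load the source vector from memory):

; Minimal sketch (assumed/reduced; not one of the tests above): lanes 1 and 2
; are extracted, passed to @llvm.sin.f32, and reinserted. The SLP tree here is
; tiny (one vectorizable node for the two calls plus one gather node for the
; extracts), but the gather is just a shuffle of %v, so the patched check lets
; it vectorize into a <2 x float> @llvm.sin.v2f32 call like the one seen in
; the NOACCELERATE output above.
define <4 x float> @sin_lanes_1_2(<4 x float> %v) {
entry:
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %s1 = call fast float @llvm.sin.f32(float %e1)
  %s2 = call fast float @llvm.sin.f32(float %e2)
  %r1 = insertelement <4 x float> %v, float %s1, i32 1
  %r2 = insertelement <4 x float> %r1, float %s2, i32 2
  ret <4 x float> %r2
}
declare float @llvm.sin.f32(float)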