diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4035,12 +4035,16 @@
   // Handle splat and all-constants stores. Also try to vectorize tiny trees
   // with the second gather nodes if they have less scalar operands rather than
   // the initial tree element (may be profitable to shuffle the second gather).
+  SmallVector<int> Mask;
   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
       (allConstant(VectorizableTree[1]->Scalars) ||
        isSplat(VectorizableTree[1]->Scalars) ||
        (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
         VectorizableTree[1]->Scalars.size() <
-            VectorizableTree[0]->Scalars.size())))
+            VectorizableTree[0]->Scalars.size()) ||
+       (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+        VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
+        isShuffle(VectorizableTree[1]->Scalars, Mask))))
     return true;

   // Gathering cost would be too much for tiny trees.
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -30,14 +30,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -313,14 +316,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float>
poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -411,14 +417,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> 
[[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -610,14 +619,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -659,14 +671,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float 
[[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -1182,10 +1197,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) [[ATTR2:#.*]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) [[ATTR2]] +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]] ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]] ; @@ -1237,14 +1252,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -1270,10 +1288,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) [[ATTR3:#.*]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float 
@llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) [[ATTR3]] +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]] ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -30,14 +30,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -313,14 +316,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; 
NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -411,14 +417,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -610,14 +619,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; 
NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -659,14 +671,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -1182,10 +1197,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; 
CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #2 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #2 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]] ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]] ; @@ -1237,14 +1252,17 @@ ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1 +; NOACCELERATE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -1270,10 +1288,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #3 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #3 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]] ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]] ; diff --git 
a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -17,12 +17,18 @@ ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]] ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4 ; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]] -; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4 ; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[E1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[E2]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP5]] +; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]] ; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4 ; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 ; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -8,30 +8,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float -; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float -; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float -; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float -; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x 
float> [[R3]], float [[AB4]], i32 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @sitofp_uitofp( @@ -81,26 +62,24 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE-LABEL: @fptosi_fptoui( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> ; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -135,30 +114,11 @@ ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX-LABEL: @fptosi_fptoui( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; AVX-NEXT: [[A4:%.*]] = 
extractelement <8 x float> [[A]], i32 4 -; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 -; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 -; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 -; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 -; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 -; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; AVX-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> +; AVX-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; AVX-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @fptosi_fptoui( @@ -335,24 +295,24 @@ ; Inspired by PR38154 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) { ; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; SSE-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 2 ; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 ; SSE-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 ; SSE-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; SSE-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float ; SSE-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float ; SSE-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float ; SSE-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float ; SSE-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float ; SSE-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> 
[[R0]], float [[TMP4]], i32 1 ; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 ; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -8,30 +8,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float -; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float -; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float -; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float -; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @sitofp_uitofp( @@ -81,26 +62,24 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE-LABEL: @fptosi_fptoui( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; 
SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> ; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -135,30 +114,11 @@ ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX-LABEL: @fptosi_fptoui( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; AVX-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 -; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 -; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 -; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 -; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 -; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 -; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; AVX-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> +; AVX-NEXT: [[R7:%.*]] 
= shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; AVX-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @fptosi_fptoui( @@ -335,24 +295,24 @@ ; Inspired by PR38154 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) { ; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; SSE-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 2 ; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 ; SSE-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 ; SSE-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; SSE-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float ; SSE-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float ; SSE-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float ; SSE-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float ; SSE-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float ; SSE-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1 ; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 ; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
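
The added clause in the tiny-tree check accepts a two-node tree whose second (gather) node is built from extractelements that together form a single shuffle of one source vector; that is the pattern the regenerated AArch64 checks above exercise. A minimal IR sketch of the pattern, with a hypothetical function name and lane choice reduced from the sin tests (the real tests load the source vector from memory):

; Minimal sketch (assumed/reduced; not one of the tests above): lanes 1 and 2
; are extracted, passed to @llvm.sin.f32, and reinserted. The SLP tree here is
; tiny (one vectorizable node for the two calls plus one gather node for the
; extracts), but the gather is just a shuffle of %v, so the patched check lets
; it vectorize into a <2 x float> @llvm.sin.v2f32 call like the one seen in
; the NOACCELERATE output above.
define <4 x float> @sin_lanes_1_2(<4 x float> %v) {
entry:
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %s1 = call fast float @llvm.sin.f32(float %e1)
  %s2 = call fast float @llvm.sin.f32(float %e2)
  %r1 = insertelement <4 x float> %v, float %s1, i32 1
  %r2 = insertelement <4 x float> %r1, float %s2, i32 2
  ret <4 x float> %r2
}
declare float @llvm.sin.f32(float)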